# Imports

In [None]:
import time
import string
import re
import nltk
import gensim
import gensim.corpora as corpora
import ujson as json
import pandas as pd
import collections
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
import ast
import glob
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Data preparation

## Import all Parler data

In [None]:
# List every CSV file found directly under ./parler_data_csv/.
parler_files = glob.glob('./parler_data_csv/*.csv')

In [None]:
# NOTE(review): parler_df is not assigned in any visible cell above — presumably
# it is loaded from one of the files in parler_files; confirm before running.
parler_df = parler_df.dropna(subset=['body', 'bodywithurls']) # Drop rows missing 'body' or 'bodywithurls' (empty messages)
parler_df = parler_df[parler_df.createdAtformatted > '2020-01-01'] # Keep rows whose createdAtformatted compares greater than '2020-01-01'

In [None]:
# Parse the timestamps so the .dt accessor can be used on the column afterwards.
parler_df['createdAtformatted'] = pd.to_datetime(parler_df['createdAtformatted']) # Convert to pandas datetimes

In [None]:
# Store only the calendar day of each message in a separate 'date' column.
parler_df['date'] = parler_df['createdAtformatted'].dt.date

# Independent copy of the rows whose body contains a '#' character.
parler_hashtags_df = parler_df.loc[parler_df['body'].str.contains('#')].copy()
# convert hashtag column to list

In [None]:
# Keep just the day and the message text, then display the result.
parler_df_hashtags = parler_hashtags_df.loc[:, ['date', 'body']]
parler_df_hashtags

## Extract hashtags

In [None]:
def aggregate_df_hashtag_count(parler_df):
    """Count how often each hashtag appears per day in one Parler dataframe.

    Rows missing ``body`` or ``bodywithurls`` are dropped, and only rows whose
    ``createdAtformatted`` compares greater than ``'2020-01-01'`` are kept.
    A hashtag is every ``#`` followed by word characters found in ``body``;
    hashtags are counted exactly as written (no case folding).

    Args:
        parler_df: DataFrame with ``body``, ``bodywithurls`` and
            ``createdAtformatted`` columns. It is not modified.

    Returns:
        DataFrame with the columns ``date`` (``datetime.date``), ``hashtag``
        and ``value`` (occurrences of that hashtag on that day), ordered by
        date and then by descending count.
    """
    posts = parler_df.dropna(subset=['body', 'bodywithurls'])
    posts = posts[posts.createdAtformatted > '2020-01-01']
    # A fresh positional index lets every extracted match be joined back to
    # its own row unambiguously, even if the caller's index has duplicates.
    posts = posts.reset_index(drop=True)
    # assign() builds a new frame, so no column is written into a slice.
    posts = posts.assign(
        date=pd.to_datetime(posts['createdAtformatted']).dt.date
    )

    # One entry per hashtag occurrence, indexed by the post it was found in.
    hashtags = (
        posts['body'].str.extractall(r'(\#\w+)')[0]
        .droplevel('match')
        .rename('hashtag')
    )
    occurrences = posts[['date']].join(hashtags, how='inner')

    # Counting on explicit (date, hashtag) keys always yields the same long
    # layout, including when the input only covers a single date.
    agg = (
        occurrences.groupby(['date', 'hashtag'])
        .size()
        .reset_index(name='value')
    )
    agg = agg.sort_values(['date', 'value'], ascending=[True, False])
    return agg.reset_index(drop=True)

For each CSV file, we group by date and hashtag and count the number of times each hashtag is used. We then export the result as `agg_{i}.csv`.

In [None]:
# Aggregate every CSV on its own and write one agg_{i}.csv per input file.
# The directory is listed once and sorted, so the progress counter matches the
# files actually processed and the agg_{i}.csv numbering is reproducible
# (glob returns paths in arbitrary order).
file_list = sorted(glob.glob('./parler_data_csv/*.csv'))
for i, file in enumerate(file_list):
    print(f'aggregating_data for file {file} ({i+1}/{len(file_list)})')
    df = pd.read_csv(file)
    agg = aggregate_df_hashtag_count(df)
    # The default index is written as well; the cell that reads these files
    # back drops it as 'Unnamed: 0'.
    agg.to_csv(f'agg_{i}.csv')

# Aggregating each file 

In [None]:
# Hashtags that are automatic when joining Parler, in lower case.
AUTOMATIC_HASHTAGS = frozenset({'#parlerconcierge', '#parler', '#newuser'})


def drop_automatic_hashtags(df):
    """Return ``df`` without the rows whose hashtag is automatic on joining Parler.

    The comparison ignores case: hashtags are lower-cased before the final
    aggregation, so any spelling that slipped through here (``#PARLER``,
    ``#NewUser`` ...) would be merged back in as ``#parler`` / ``#newuser``.

    Args:
        df: DataFrame with a ``hashtag`` column. It is not modified.

    Returns:
        The rows of ``df`` whose hashtag is not in ``AUTOMATIC_HASHTAGS``.
    """
    lowered = df['hashtag'].astype(str).str.lower()
    return df[~lowered.isin(AUTOMATIC_HASHTAGS)]


# Load every per-file aggregate, strip the automatic hashtags and collect the
# cleaned frames for concatenation.
dfs = []
for file in glob.glob('agg*.csv'):
    df = pd.read_csv(file)
    print(df.shape)
    df = df.sort_values('value', ascending=False)
    df = drop_automatic_hashtags(df)
    # 'Unnamed: 0' is the index column that was written by to_csv.
    df = df.drop('Unnamed: 0', axis=1)
    dfs.append(df)

In [None]:
# Stack the per-file aggregates on top of each other (row-wise).
agg_df = pd.concat(dfs)

# Final aggregation 

In [None]:
# Lower-case every hashtag (as a string) so case variants merge in the final aggregation.
agg_df['hashtag'] = agg_df['hashtag'].map(lambda tag: str(tag).lower())

## Grouping by day 

In [None]:
# Sum the counts so that each (date, hashtag) pair appears exactly once.
agg_df = agg_df.groupby(['date', 'hashtag'], as_index=False).sum()

In [None]:
# Preview the rows with the highest counts first (the result is displayed, not stored).
agg_df.sort_values('value', ascending=False)

In [None]:
# Total uses of each hashtag over the whole period. Only 'value' is summed:
# summing every column would also try to add up the 'date' column, which is
# meaningless for dates.
low_hashtags = agg_df.groupby('hashtag')[['value']].sum()

## Remove hashtags that are not used much

In [None]:
# Keep the hashtags used fewer than 5000 times in total, most used first,
# with 'hashtag' moved from the index back into a column.
low_hashtags = (
    low_hashtags.loc[low_hashtags['value'] < 5000]
    .sort_values('value', ascending=False)
    .reset_index()
)

In [None]:
# Distinct hashtags that fall below the usage threshold.
to_drop = low_hashtags['hashtag'].unique()

In [None]:
# Keep only the rows whose hashtag is not in the low-usage list.
agg_export = agg_df.loc[~agg_df['hashtag'].isin(to_drop)]

In [None]:
# Convert to datetime. assign() builds a new frame instead of writing a column
# into agg_export, which is a filtered selection of agg_df; writing into such a
# selection triggers pandas' SettingWithCopyWarning.
agg_export = agg_export.assign(date=pd.to_datetime(agg_export['date']))

## Aggregate hashtags per month

In [None]:
# Roll the daily counts up to one row per (month, hashtag), rename 'hashtag'
# to 'name', and display the result.
df1 = (
    agg_export
    .groupby([pd.Grouper(key='date', freq='M'), 'hashtag'])
    .sum()
    .reset_index()
    .rename(columns={'hashtag': 'name'})
)
df1

# Export

In [None]:
import json  # NOTE: rebinds `json`, which the imports cell bound to ujson
# Serialise df1 through pandas (orient='records') and parse the text back into
# plain Python objects — presumably so pandas handles the date column; confirm.
res = json.loads(df1.to_json(orient='records'))


# Write the parsed records to top_hashtags_month.json.
with open('top_hashtags_month.json', 'w+') as outfile:
    json.dump(res, outfile)