# Pre-processing 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/My Drive/Tilburg DSS/Thesis/Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
pip install anonymizedf
from anonymizedf.anonymizedf import anonymize

In [None]:
# Load the collected tweet dataset from the Excel export in the working directory.
df=pd.read_excel("EUGreenDeal.xlsx") #Collected Dataset

In [None]:
# Keep only the rows whose `language` column equals "en".
df=df[(df.language=="en")]

In [None]:
import re 
def clean_tweet(r):
    """Return *r* with every ``http``-prefixed run of non-whitespace removed.

    Only the URL token itself is dropped; the whitespace around it is kept.
    """
    return re.sub(r'http\S+', '', r)

In [None]:
# Pull the raw tweet texts out as a plain Python list.
tweet_list = df.tweet.to_list()

In [None]:
# Strip URLs from every tweet and store the result next to the original text.
cleaned_url = [clean_tweet(tw) for tw in tweet_list]
df["cleaned_url"] =cleaned_url

In [None]:
# Tweets that are identical once their URLs are removed count as duplicates;
# drop_duplicates keeps the first occurrence of each.
df.drop_duplicates(subset='cleaned_url',inplace=True)

In [None]:
# Columns carried forward into the rest of the pipeline.
columns=['id', 'date',
       'tweet', 'language', 'hashtags', 'user_id',
       'username','cleaned_url']

In [None]:
df1=df[columns]

In [None]:
# Renumber the rows 0..n-1 after the filtering and de-duplication above.
df1 = df1.reset_index(drop=True)

In [None]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made.
df2=df1

#### Anonymizing the Users

In [None]:
# Wrap the frame with anonymizedf's `anonymize` helper.
df_an = anonymize(df2)
# Generate stand-in labels for `username`; later cells read them from the
# `Fake_username` column (values such as "username 1").
# NOTE(review): presumably the original `username` column is still present in
# the returned frame -- confirm against the anonymizedf documentation.
anon_df = (
    df_an
    .fake_categories("username", chaining=True)
    .show_data_frame()
)

In [None]:
# From here on df2 refers to the frame returned by show_data_frame().
df2=anon_df

### User sampling & Daily Number of Tweet Plots

In [None]:
# Share (%) of all tweets contributed by each anonymised user, before any
# down-sampling. value_counts() sorts descending, so e[:20] is the top 20.
d = df2["Fake_username"].value_counts()
e = round(d / d.sum() * 100, 2)

# Draw the bars first, then label and save. The figure must already contain
# the chart when savefig() runs, and it must be saved before show() releases
# it; otherwise the PNG holds only an empty, labelled axes.
e[:20].plot(kind="bar", color="darkblue")
plt.title("# of tweets per users")
plt.xlabel("users")
plt.ylabel("# of tweets (%)")
plt.savefig('Users_before.png', bbox_inches='tight', dpi=300, format='png')
plt.show()
# The bare expressions in this section (this one, `df2`, the value_counts()
# call and `b`) only produce output when they are the last statement of a
# notebook cell; otherwise their values are discarded.
df2[(df2.Fake_username=="username 1")]
df2
# Down-sample the user labelled "username 1" to 500 tweets (fixed seed for
# reproducibility) and keep every tweet from all other users.
# NOTE(review): sample(n=500) raises ValueError if that user has fewer than
# 500 rows.
df3=df2[(df2.Fake_username=="username 1")].sample(n=500, random_state=1)
df4=df2[(df2.Fake_username!="username 1")]
df4["Fake_username"].value_counts()
frames=[df3,df4]
# df5 is the working frame for the rest of the notebook. Its index still
# carries the original row labels at this point.
df5=pd.concat(frames)
b=df5["Fake_username"].value_counts()
b
# Per-user share (%) of tweets after down-sampling.
c=round(b/sum(df5["Fake_username"].value_counts())*100,2)

# Per-user share (%) of tweets after down-sampling, top 20 users.
plt.title("# of tweets per users")
plt.xlabel("users")
plt.ylabel("# of tweets (%)")
c[:20].plot(kind="bar", color="darkblue")
plt.ylim(0, 4)
# Save before show(): once show() has displayed the figure, notebook and
# non-interactive backends release it, and a later savefig() writes a blank
# image.
plt.savefig('Users_after.png', bbox_inches='tight', dpi=300, format='png')
plt.show()

# Displayed only when this is the last statement of a notebook cell.
df5.nunique()
# Split the `date` column of the sampled frame into a calendar-date column
# and a time-of-day column.
df5['Dates'] = pd.to_datetime(df5['date']).dt.date
df5['Time'] = pd.to_datetime(df5['date']).dt.time


# Same split for df2, the frame df5 was sampled from.
df2['Dates'] = pd.to_datetime(df2['date']).dt.date
df2['Time'] = pd.to_datetime(df2['date']).dt.time

# reference: https://gist.github.com/gdsaxton/3d4934c61a435768e687d01aa3f46b4a
def tweets_per_day(df):
    """Count the tweets posted on each calendar day.

    Side effect: the ``Dates`` column of *df* is converted to datetimes in
    place.

    Returns a one-column DataFrame (``tweet``) of per-day row counts, indexed
    by date.
    """
    df['Dates'] = pd.to_datetime(df['Dates'])
    day_keys = df['Dates'].dt.date
    tweets_only = df[['tweet']]
    return tweets_only.groupby(day_keys).count()
  

# Daily tweet counts for df2 (the frame before down-sampling).
# NOTE: tweets_per_day() also rewrites df2['Dates'] as datetimes.
Tweets_count=tweets_per_day(df2)
# NOTE(review): the same file name is written again further down for df5, so
# this export is overwritten.
Tweets_count.to_excel("daily_tweets.xlsx")
from matplotlib import pyplot as plt, dates as mdates
plt.figure(figsize=(12,6))
# Line chart of the per-day counts; the pandas plot call returns the Axes.
daily_plot = Tweets_count['tweet'].plot(kind='line', lw=1, alpha=0.75, legend=True, x_compat=True)
daily_plot.set_xlabel('Date', weight='bold', labelpad=15)
daily_plot.set_ylabel('# Tweets', weight='bold', labelpad=15)
plt.xticks(fontsize = 10, rotation = -40, ha ="left")
plt.yticks(fontsize = 10)

# Clears the Axes' legend_ attribute -- presumably to hide the legend that
# legend=True attached.
daily_plot.legend_ = None
daily_plot.tick_params(axis='x', pad=5)
# NOTE(review): this figure is never written to disk in this section.

# Daily tweet counts for df5 (the down-sampled frame).
# NOTE: tweets_per_day() also rewrites df5['Dates'] as datetimes.
Tweets_count=tweets_per_day(df5)
# NOTE(review): an earlier statement already wrote "daily_tweets.xlsx" from
# df2; this call replaces that file.
Tweets_count.to_excel("daily_tweets.xlsx")
from matplotlib import pyplot as plt, dates as mdates
plt.figure(figsize=(12,6))
# Line chart of the per-day counts; the pandas plot call returns the Axes.
daily_plot = Tweets_count['tweet'].plot(kind='line', lw=1, alpha=0.75, legend=True, x_compat=True)
daily_plot.set_xlabel('Date', weight='bold', labelpad=15)
daily_plot.set_ylabel('# Tweets', weight='bold', labelpad=15)
plt.xticks(fontsize = 10, rotation = -40, ha ="left")
plt.yticks(fontsize = 10)


# Clears the Axes' legend_ attribute -- presumably to hide the legend that
# legend=True attached.
daily_plot.legend_ = None
daily_plot.tick_params(axis='x', pad=5)
# Written before any show() call, so the saved image contains the chart.
plt.savefig('daily counts.png', bbox_inches='tight', dpi=300, format='png')


 

### Text Cleaning

In [None]:
# Each cleaning step writes a new tweet_<n> column so that intermediate
# results stay available for inspection.
# Step 1: lowercase the URL-stripped text.
df5['tweet_1'] = df5.cleaned_url.str.lower()
df5
# Step 2: remove leftover web-address fragments -- either "www." followed by
# an optional single letter, an optional dot and one or more "com", or a run
# of letters followed by ".com".
df5['tweet_2']=df5.tweet_1.apply(lambda x: re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', x))
df5
# Step 3: remove the literal placeholders "{link}" and "[video]".
df5['tweet_3']=df5.tweet_2.apply(lambda x: re.sub(r'{link}', '', x))
df5['tweet_3']=df5.tweet_3.apply(lambda x: re.sub(r"\[video\]", '', x))


# reference: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/47091490#47091490

def decontracted(phrase):
    """Expand common English contractions in *phrase*.

    The rules are applied one after another in the order listed. The two
    irregular forms come first so the generic ``n't`` rule does not turn
    them into "wo not" / "ca not". Matching is case-sensitive and only the
    plain ASCII apostrophe is recognised.
    """
    rewrites = (
        # Irregular forms.
        (r"won't", "will not"),
        (r"can't", "cannot"),
        # Generic suffixes.
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
        (r"'em", " them"),
    )
    for pattern, expansion in rewrites:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase




In [None]:
# Step 4: expand contractions ("won't" -> "will not", ...).
df5['tweet_4'] = df5.tweet_3.apply(lambda x: decontracted(x))
# Step 5: drop HTML entities made of lowercase letters, e.g. "&amp;".
df5['tweet_5'] = df5.tweet_4.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

# Steps 6-8: drop @mentions, #hashtags, then every character that is not an
# ASCII letter, a digit or whitespace.
df5['tweet_6'] = df5.tweet_5.apply(lambda x: re.sub(r"@[A-Za-z0-9_]+","", x))
df5['tweet_7'] = df5.tweet_6.apply(lambda x: re.sub(r"#[A-Za-z0-9_]+","", x))
df5['tweet_8'] = df5.tweet_7.apply(lambda x: re.sub(r"[^A-Za-z0-9\s]+", "", x))

# Step 9: drop stand-alone numbers (with the whitespace after them), then any
# remaining token that contains a digit.
df5['tweet_9'] = df5.tweet_8.apply(lambda x: re.sub(r"\b[0-9]+\b\s*", "", x))
df5['tweet_9'] = df5.tweet_9.apply(lambda x: re.sub(r'\w*\d\w*', '', x))

# Step 10: collapse a character repeated four or more times in a row down to
# a single occurrence.
df5['tweet_10'] = df5.tweet_9.apply(lambda x:re.sub(r'(.)\1{3,}',r'\1', x)) #Remove repeated characters
# Step 11: squeeze runs of spaces into one and trim both ends.
df5['tweet_11'] = df5.tweet_10.apply(lambda x: re.sub(r' +',' ',x).strip())

def tokenization(text):
    """Split *text* into tokens on runs of non-word characters.

    The result is exactly what ``re.split`` returns: an empty input, or a
    separator at the very start or end of *text*, yields an empty-string
    token at that position.
    """
    # Raw string: in a normal literal "\W" is an invalid escape sequence,
    # which newer Python versions warn about and plan to reject.
    return re.split(r'\W+', text)

# Tokenise the fully cleaned text into a list of words per tweet.
df5['Tweet_tokenized'] = df5['tweet_11'].apply(lambda x: tokenization(x))
# Renumber the rows 0..n-1; later code looks rows up by position-like labels
# and concatenates column-wise against a frame with a default index.
df5=df5.reset_index(drop=True)



In [None]:
# reference: https://github.com/nltk/nltk
!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')


###  Spell Checking

In [None]:
!pip install pyspellchecker
from spellchecker import SpellChecker


In [None]:
# reference: https://gist.github.com/ghadj/507e53effcf7fa9e873b3ed485723527

spell = SpellChecker()
# Register "url" as a known word so it is never "corrected".
spell.word_frequency.load_words(['url'])

# One spell-checked string per tweet, in row order.
a = []
for tokens in df5.Tweet_tokenized:
    # Ask the checker once per tweet which tokens it does not know, rather
    # than once per word.
    misspelled = spell.unknown(tokens)
    # Replace each unknown token with its most likely correction. When the
    # checker has no candidate, correction() returns None; keep the original
    # token so the literal text "None" never enters the corpus.
    processed_tweet = [
        (spell.correction(word) or word) if word in misspelled else word
        for word in tokens
    ]
    a.append(" ".join(map(str, processed_tweet)))
print(a)

In [None]:
# One row per tweet: the spell-checked text as a single string.
df_processed=pd.DataFrame(a, columns=["spelled_checked"])

In [None]:
# Lowercase the spell-checked text.
df_processed.spelled_checked=df_processed.spelled_checked.str.lower()

In [None]:
# Column-wise concat aligns rows on the index: df5's index was reset earlier
# and df_processed carries the default 0..n-1 index.
frames2=[df5,df_processed]
df6=pd.concat(frames2, axis=1)

In [None]:
# Checkpoint the frame to disk after the spell-checking step.
df6.to_pickle("df6_.pkl")

In [None]:
def short_words(text):
    """Remove every one- or two-character word from *text*, except "no".

    Any non-word characters directly in front of a removed word (typically
    the separating space) are removed together with it.
    """
    return re.sub(r'\W*\b(?!no)\w{1,2}\b', '', text)

In [None]:
# Drop one- and two-character words (except "no") from the spell-checked text.
df6["wo_shortwords"]=df6.spelled_checked.apply(lambda x: short_words(x))

In [None]:
from nltk.corpus import stopwords
# Extra stop words added on top of NLTK's English list.
additional  = ["green", "europe", "european", "via", "moi", "none"]
swords = set().union(stopwords.words('english'),additional)

In [None]:
# Keep "not", "but" and "no" in the text: removing them would change the
# meaning of a tweet. set.remove raises KeyError if the word is absent.
swords.remove("not")
swords.remove("but")
swords.remove("no")

In [None]:
# Split on whitespace and keep only the tokens that are not stop words; the
# column holds a list of tokens per tweet.
df6["wo_stopwords_2"] = df6.wo_shortwords.apply(lambda x: [i for i in x.split() if not i in swords])

### Lemmatizing 

In [None]:
from nltk.stem import WordNetLemmatizer

wordnet_lem = WordNetLemmatizer()

def lemm(text):
    """Return a list with the WordNet lemma of every token in *text*.

    *text* is an iterable of tokens; the module-level ``wordnet_lem``
    lemmatizer is used with its default settings.
    """
    return [wordnet_lem.lemmatize(token) for token in text]

# Lemmatise the stop-word-free token lists; the column holds a list per tweet.
df6['tweet_lem'] = df6.wo_stopwords_2.apply(lambda x: lemm(x))

In [None]:
df6.to_pickle("df6_processed.pkl") #save the DataFrame as a pickle file: