## The Social Web 2020 - VU course
Part of the sentiment and word-frequency analysis and visualisation for 'A Sentiment Analysis of Tweets Regarding the LGBT Community in Different Countries'.

Programmed by Sarah van Gerwen

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as gojsn
import plotly.express as px
import matplotlib.pyplot as plt
import string
import requests
import json
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from plotly.subplots import make_subplots
from PIL import Image

ALL TWEETS: Sentiment visualization + chi-squared analysis

In [None]:
def main_sentiment(sentned, sentcan, sentsau, sentrus):
    """Visualise the per-country sentiment counts and run four chi-squared tests.

    Each argument is the path of a JSON file for the Netherlands, Canada,
    Saudi Arabia and Russia respectively, with the format:
    {"positive": positive, "positive_proc": positive_percentage,
     "negative": negative, "negative_proc": negative_percentage,
     "neutral": neutral, "neutral_proc": neutral_percentage, "total": total}

    Prints the chi-squared results of every experiment; returns nothing.
    """
    #open file
    sentned = open_data(sentned)
    sentcan = open_data(sentcan)
    sentsau = open_data(sentsau)
    sentrus = open_data(sentrus)

    #visualization
    sentiment_visualisation(sentned, sentcan, sentsau, sentrus)

    #chi squared analysis x 4: one column per country, one row per count
    df = pd.DataFrame.from_dict([sentned, sentcan, sentsau, sentrus]).T.rename(columns={0: 'The Netherlands', 1: 'Canada', 2: 'Saudi Arabia', 3: 'Russia'}).loc[['positive','negative','neutral', 'total']]

    #Experiment 1: all countries:
    print ('Sentiment chi-squared 1: Difference all countries')
    chi2analysis(df)

    #Experiment 2: free vs non-free:
    #built as its own frame so that df keeps exactly the four country columns
    df_exp2 = pd.DataFrame({
        'free': df['The Netherlands'] + df['Canada'],
        'nonfree': df['Saudi Arabia'] + df['Russia'],
    })
    print ('Sentiment chi-squared 2: Difference free vs non-free')
    chi2analysis(df_exp2)

    #Experiment 3: all English:
    df_exp3 = df[['The Netherlands', 'Canada', 'Saudi Arabia']]
    print ('Sentiment chi-squared 3: Difference English')
    chi2analysis(df_exp3)

    #Experiment 4: all native:
    df_exp4 = df[['Canada', 'Russia']]
    print ('Sentiment chi-squared 4: Difference native')
    chi2analysis(df_exp4)

def open_data(data):
    """Load the JSON file at path `data` and return its decoded content.

    The file is read as UTF-8 (the JSON interchange encoding) instead of the
    platform default, so the result does not depend on the machine's locale.
    """
    with open(data, encoding='utf-8') as file:
        return json.load(file)
    
def sentiment_visualisation(sentned, sentcan, sentsau, sentrus):
    """Draw the sentiment pie charts and the percentage bar chart.

    Takes the four loaded sentiment dictionaries (Netherlands, Canada,
    Saudi Arabia, Russia) and hands the derived frames to pie_chart and barplot.
    """
    records = [sentned, sentcan, sentsau, sentrus]
    #positional labels 0..3 -> country names
    country_by_position = dict(enumerate(['The Netherlands', 'Canada', 'Saudi Arabia', 'Russia']))

    #pie chart: countries as columns, absolute counts as rows
    counts = pd.DataFrame.from_dict(records).T
    counts = counts.rename(columns=country_by_position)
    counts = counts.loc[['positive','negative','neutral']]
    pie_chart(counts)

    #bar chart: countries as rows, percentages as columns
    shares = pd.DataFrame.from_dict(records).rename(index=country_by_position)
    shares = shares.loc[:,['positive_proc', 'negative_proc', 'neutral_proc']]
    #strip the percent sign so the values can be converted to numbers
    shares = shares.replace({'%':''}, regex = True)
    shares = shares.apply(pd.to_numeric)
    barplot(shares)
    
def pie_chart(df):
    """Show one pie chart per country with that country's own colour palette.

    `df` needs a column per country name used below; the three values of a
    column are paired, in order, with the positive/negative/neutral labels.
    """
    labels = ['positive', 'negative', 'neutral']
    #colour palette per country, in the order the charts are shown
    palettes = {
        'The Netherlands': {'positive':'cornsilk','neutral':'orange','negative':'orangered'},
        'Canada': {'positive':'lawngreen', 'neutral':'limegreen', 'negative':'darkgreen'},
        'Saudi Arabia': {'positive':'lightcoral', 'neutral':'crimson', 'negative':'darkred'},
        'Russia': {'positive':'lightcyan', 'neutral':'royalblue', 'negative':'darkblue'},
    }

    for country, palette in palettes.items():
        piechrt = px.pie(df, values = country, names = labels, color = labels, color_discrete_map=palette)
        piechrt.update_traces(textposition='inside', textinfo='percent+label', showlegend=False)
        piechrt.show()
        
def barplot(df):
    """Plot `df` as a bar chart (green, red, grey) and save it as 'bar_sent.png'."""
    axes = df.plot.bar(color=['green', 'red', 'grey'])
    axes.get_figure().savefig('bar_sent.png')

def chi2analysis(df):
    """Run scipy's chi-squared contingency test on the sentiment rows of `df`.

    `df` must have the index labels 'positive', 'negative' and 'neutral';
    its columns are the groups that are compared. Prints the statistic, the
    critical value at the 0.95 quantile, the p-value and the degrees of freedom.

    NOTE: a second chi2analysis, taking a ready-made table, is defined
    further down in this file; once that definition has been executed it
    replaces this one.
    """
    #each row list is wrapped in its own list, as in the original table layout
    table = [[df.loc[row].tolist()] for row in ('positive', 'negative', 'neutral')]
    chi, p, dof, _ = chi2_contingency(table)

    report = (
        ('chisquared', chi),
        ('crit', chi2.ppf(0.95, dof)),
        ('p', p),
        ('dof', dof),
    )
    for name, value in report:
        print (name + '= ' + str(value))

In [None]:
#run the sentiment pipeline on the four sentiment files
#(argument order: Netherlands, Canada, Saudi Arabia, Russia)
main_sentiment(
    'tweets-netherlands-2020-03-14_19_59_49_sentiments.json',
    'tweets-canada-2020-03-14_19_36_20_sentiments.json',
    'tweets-saudiarabia-2020-03-14_20_54_54_sentiments.json',
    'tweets-russia_sentiments.json',
)

ALL TWEETS: Word frequency analysis + sent analysis with VADER + visualization + chi-squared analysis

In [None]:
def word_frequency_analysis(rawbatchned, rawbatchcan, rawbatchsau, rawbatchrus, rus_limit=82):
    """Run the word-frequency pipeline for the four raw tweet files.

    Each raw file is line-delimited JSON ({"id": id, "id_str": id_str,
    "text": text, etc}). The Dutch, Canadian and Saudi files are labelled with
    VADER directly; the Russian tweets are translated first.

    rus_limit: number of leading Russian tweets that are used (default 82,
    the selection used in this analysis).
    """
    #open file and do sentiment analysis with VADER for ned can and sau
    dfned = open_file_sent(rawbatchned)
    dfcan = open_file_sent(rawbatchcan)
    dfsau = open_file_sent(rawbatchsau)

    #open file, keep only the first rus_limit Tweets, translate them, apply VADER
    #the selection happens BEFORE translating so that no translation request
    #is spent on Tweets that are thrown away afterwards; .copy() gives the
    #later column assignments an independent frame to write to
    dfrus = pd.read_json(rawbatchrus, lines=True, encoding='utf8')
    dfrus = dfrus.iloc[:rus_limit].copy()
    dfrus = translation(dfrus)
    dfrus = VADER_sentiment(dfrus)

    #name dfs to keep track
    dfnedname = 'ned'
    dfcanname = 'can'
    dfsauname = 'sau'
    dfrusname = 'rus'

    #get frequencies of words and make wordclouds
    dfned = get_frequencies(dfned, dfnedname)
    dfcan = get_frequencies(dfcan, dfcanname)
    dfsau = get_frequencies(dfsau, dfsauname)
    dfrus = get_frequencies(dfrus, dfrusname)

    #do the different chi-squared experiments
    between_comparison_wordfreq(dfrus, dfsau, dfcan, dfned, dfrusname, dfsauname, dfcanname, dfnedname)

def open_file_sent(data):
    """Read a line-delimited JSON tweet file and label every tweet with VADER.

    `data` is a file with one JSON object per line ({"id": id, "id_str": id_str,
    "text": text, etc}). Returns the dataframe as returned by VADER_sentiment.
    """
    tweets = pd.read_json(data, lines=True, encoding='utf8')
    return VADER_sentiment(tweets)

def VADER_sentiment(df):
    """Add a 'sentiment' column to `df` based on VADER's compound score.

    Thresholds follow https://github.com/cjhutto/vaderSentiment:
    compound >= 0.05 is 'positive', compound <= -0.05 is 'negative' and
    anything strictly in between is 'neutral'. The column is written onto the
    passed dataframe, which is also returned.
    """
    analyser = SentimentIntensityAnalyzer()
    labels = []

    for tweet in df['text']:
        compound = analyser.polarity_scores(tweet)['compound']
        if compound >= 0.05:
            labels.append('positive')
        elif compound <= -0.05:
            labels.append('negative')
        elif -0.05 < compound < 0.05:
            labels.append('neutral')

    df['sentiment'] = labels
    return df
    
def translation(df):
    """Replace the Russian tweets in df['text'] by their English translation.

    Adapted from https://github.com/cjhutto/vaderSentiment/blob/master/README.rst#demo-including-example-of-non-english-text-translations

    One request is sent per tweet. The tweet is passed through `params`, so
    requests URL-encodes it; characters such as '#', '&' or '?' in a tweet
    therefore no longer cut off or corrupt the query string.

    Returns the same dataframe with the 'text' column overwritten.
    Raises requests.HTTPError when the service answers with an error status
    and requests.Timeout when it does not answer within 30 seconds.
    """
    api_url = "http://mymemory.translated.net/api/get"
    #headers are identical for every request, so they are built once
    hdrs ={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'none',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive'}

    sentences_translated = []

    for tweet in df['text']:
        response = requests.get(api_url,
                                params={'q': tweet, 'langpair': 'ru|en'},
                                headers=hdrs,
                                timeout=30)
        #fail loudly on an HTTP error instead of on an unrelated parse error
        response.raise_for_status()
        response_json = response.json()
        sentences_translated.append(response_json["responseData"]["translatedText"])

    df['text'] = sentences_translated
    return df

def get_frequencies(df, dfname):
    """Count, per word, in how many positive/negative/neutral tweets it occurs.

    Every tweet in df['text'] is tokenized and part-of-speech tagged; tokens
    tagged 'NN' are counted under the tweet's label from df['sentiment'].
    The counts are cleaned with wordfreq_preprocessing, the five most frequent
    words per label are printed and a word cloud of the totals is made.

    dfname: short country name used in the printed headers and for the cloud.
    Returns a dataframe indexed by word with the count columns plus
    'label_context', the label with the highest count for that word.
    """
    counts_by_word = {}

    for text, sent in zip(df['text'], df['sentiment']):
        #tokenize tweet and tag the words with part-of-speech tag
        for word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
            #only tokens tagged NN are counted
            if tag != 'NN':
                continue
            entry = counts_by_word.setdefault(
                word, {'positive' : 0, 'negative' : 0, 'neutral' : 0, 'total' : 0})
            entry[sent] += 1
            entry['total'] += 1

    #preprocess data
    counts_by_word = wordfreq_preprocessing(counts_by_word)

    #add overall label
    freq_df = pd.DataFrame(counts_by_word).transpose()
    freq_df['label_context'] = freq_df[['positive','negative', 'neutral']].idxmax(axis=1)

    #prints five most frequent entries for each label; the frame stays sorted by the last one
    for label in ['positive', 'negative', 'neutral', 'total']:
        freq_df = freq_df.sort_values(by=[label], ascending = False)
        print ('Five most frequent %s words from %s' % (label, dfname))
        print (freq_df.head(5))

    #make wordcloud
    wordcloud(freq_df, dfname, 'total')

    return freq_df
    
def _merge_counts(target, source):
    #add every count of source onto the matching count of target
    for label, count in source.items():
        target[label] = target[label] + count


def wordfreq_preprocessing(freq):
    """Clean a word -> counts dictionary in place and return it.

    freq maps each word to a dict of counts that all share the same keys
    (e.g. 'positive', 'negative', 'neutral', 'total').

    Steps:
    1. Remove the selected stop words in all three spellings (as listed,
       lower case, upper case) and single ASCII punctuation characters.
    2. Remove tokens that contain no cased character at all.
    3. Merge spellings that only differ in case: a mixed-case word is folded
       into its lower-case form when that form exists; an all-upper-case word
       then absorbs its lower-case form when that form exists.

    The merge order in step 3 is fixed, so the result no longer depends on
    the order of the dictionary.
    """
    #remove punctuation and selected words
    no = ["n't", 'I', '’', "'s", '...', '``', "''", '“', '”', "'m", 'Is', 'In', 'It', 'Are', 'https', 'The', 'a', 'And', 'The', 'A', 'Of', 't', 'For', 'If', 'By', 'An','So', 'To', 'However', 'Whence', 'Whenever', 'Where', 'Whereby', 'Whereever', 'Wherein', 'Whereof', 'That', 'What', 'Whatever', 'Which', 'Whichever']
    for entry in no:
        #every spelling is removed, not just the first one that is found
        for variant in (entry, entry.lower(), entry.upper()):
            freq.pop(variant, None)
    for punc in string.punctuation:
        freq.pop(punc, None)

    #tokens without any cased character (digits, symbols, ...) are dropped.
    #NOTE(review): the previous implementation removed these as a side effect
    #of its duplicate handling; kept so existing output does not change -
    #confirm that this is wanted.
    caseless = [word for word in freq if word == word.upper() == word.lower()]
    for word in caseless:
        del freq[word]

    #remove lower-uppercase difference
    #first fold mixed-case words (e.g. 'Pride') into the lower-case entry ...
    for word in list(freq):
        lowered = word.lower()
        if word != lowered and word != word.upper() and lowered in freq:
            _merge_counts(freq[lowered], freq.pop(word))

    #... then let all-upper-case words (e.g. 'LGBT') absorb the lower-case entry
    for word in list(freq):
        lowered = word.lower()
        if word == word.upper() and word != lowered and lowered in freq:
            _merge_counts(freq[word], freq.pop(lowered))

    return freq
                
            
def wordcloud(df, whichmask, whichone):
    """Draw, save and show a masked word cloud of the column `whichone`.

    See https://github.com/amueller/word_cloud for details.

    df: dataframe indexed by word.
    whichmask: 'can', 'rus', 'ned', 'sau' or 'comb' pick their own mask image;
        any other name containing 'nonfree' uses the non-free mask and
        everything else falls back to the free mask.
    whichone: column holding the frequencies.

    The image is written to '<whichone><whichmask>_wordcloud.png'.
    """
    frequencies = df[whichone].to_dict()

    #mask image for the names that are matched exactly
    exact_masks = {
        'can': 'cangreen.png',
        'ned': 'nedorange.png',
        'sau': 'saured.png',
        'rus': 'rusblue.png',
        'comb': 'togetherpurp.png',
    }
    if whichmask in exact_masks:
        maskedimage = exact_masks[whichmask]
    elif 'nonfree' in whichmask:
        maskedimage = 'nonfreedarkred.png'
    else:
        maskedimage = 'freedarkgreen.png'

    #the output name is built the same way for every mask
    out = str(whichone) + str(whichmask) + '_wordcloud.png'

    mask = np.array(Image.open(maskedimage))
    image_colors = ImageColorGenerator(mask)
    cloud = WordCloud(background_color="white", mask = mask)
    cloud.generate_from_frequencies(frequencies=frequencies)

    #save the cloud in the colours of the mask image, then show it
    cloud.recolor(color_func=image_colors).to_file(out)
    plt.figure()
    plt.imshow(cloud.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")
    plt.show()
    
def between_comparison_wordfreq(rus,sau,can,ned,namerus,namesau,namecan,namened):
    """Run all between-country word-frequency experiments and print the results.

    rus, sau, can, ned: word-frequency dataframes per country.
    namerus, namesau, namecan, namened: the short country names used as
    column suffixes when two country frames are merged.

    Experiment 1 uses config(...) with inner=True, experiment 2 with
    inner=False; both run the same four comparisons. Experiment 3 makes the
    word clouds for the free / non-free groups, for all countries together and
    for the words unique to each group.
    """
    #the four comparisons of experiments 1 and 2: (title, english, native, groups)
    comparisons = [
        ('All differences', False, False, False),
        ('Free non-free', False, False, True),
        ('English', True, False, False),
        ('Native', False, True, False),
    ]

    #EXPERIMENT 1 (inner=True) and EXPERIMENT 2 (inner=False)
    #each comparison covers positive - negative - neutral - total
    for number, (heading, inner) in enumerate([('Intersection', True), ('Union', False)], start=1):
        print ('\n EXPERIMENT %d %s \n' % (number, heading))
        for sub, (title, english, native, groups) in enumerate(comparisons, start=1):
            print ('\n %d.%d %s \n' % (number, sub, title))
            df = config(rus,sau,can,ned,inner,english,native,groups)
            #the comparison of all four countries is the one without any selector
            entire = not (english or native or groups)
            labels_and_chisquared(df,entire,english,native,groups)

    #EXPERIMENT 3: TOPIC MODELLING
    print ('\n EXPERIMENT 3 Topics of free vs non-free \n')
    #3.1 uses every word (outer join), 3.2 only the shared words (inner join)
    cloud_runs = [('Combined words', 'outer', False), ('Combined intersected words', 'inner', True)]
    for sub, (title, how, inner) in enumerate(cloud_runs, start=1):
        print ('\n 3.%d %s \n' % (sub, title))
        freenonfree_freq_clouds(ned,namened,can,namecan,how,'free' + how)
        freenonfree_freq_clouds(sau,namesau,rus,namerus,how,'nonfree' + how)
        df = config(rus,sau,can,ned,inner,False,False,False)
        column = 'together' + how
        df[column] = df['total_ned'] + df['total_can'] + df['total_rus'] + df['total_sau']
        wordcloud(df, 'comb', column)

    print ('\n 3.3 Unique words \n')
    df = config(rus,sau,can,ned,False,False,False,False)
    unique_words(df)
    
def config(rus,sau,can,ned,inner,english,native,groups):
    """Merge the country word-frequency frames on their word index.

    inner: True joins on shared words only, False keeps every word.
    english / native / groups: True/False selectors for the layout:
      - none set: ned+can and rus+sau are each joined with the chosen join
        type and the two halves are joined the same way;
      - groups: ned+can and rus+sau are always outer-joined, the chosen join
        type is only used between the two halves;
      - english: ned, can and sau (sau keeps its unsuffixed column names);
      - native: can and rus.
    Missing values are filled with 0. Returns the merged dataframe.
    """
    how = 'inner' if inner else 'outer'

    def join(left, right, mode, suffixes):
        #index-on-index merge, shared by every layout below
        return left.merge(right, how=mode, left_index = True, right_index = True, suffixes=suffixes)

    if groups or not (english or native):
        #four-country layouts; only the groups layout forces the halves to outer
        half_how = 'outer' if groups else how
        free_half = join(ned, can, half_how, ('_ned', '_can'))
        nonfree_half = join(rus, sau, half_how, ('_rus', '_sau'))
        merged = join(free_half, nonfree_half, how, ('', ''))
    elif english:
        merged = join(join(ned, can, how, ('_ned', '_can')), sau, how, ('', ''))
    else:
        #native
        merged = join(can, rus, how, ('_can', '_rus'))

    return merged.fillna(0)
    
def labels_and_chisquared(df,entire,english,native,groups):
    """Run the chi-squared analysis once per label on a configured dataframe.

    df: dataframe from config(...), indexed by word.
    entire / groups / english / native: True/False selectors, checked in that
    order, that say which columns are compared:
      - entire:  <label>_ned, <label>_can, <label>_rus, <label>_sau
      - groups:  free (= ned + can) against nonfree (= rus + sau)
      - english: <label>_ned, <label>_can and the unsuffixed <label> column
      - native:  <label>_can, <label>_rus
    If no selector is set nothing is analysed.

    For every label, words whose counts are 0 in all compared country columns
    are left out of that label's table. The selection is made per label on the
    untouched `df`, so a word that is left out for one label is still
    available for the others. `df` itself is not modified.
    """
    if entire or groups:
        suffixes = ['_ned', '_can', '_rus', '_sau']
    elif english:
        suffixes = ['_ned', '_can', '']
    elif native:
        suffixes = ['_can', '_rus']
    else:
        return

    labels = ['positive', 'negative', 'neutral', 'total']
    for label in labels:
        columns = [label + suffix for suffix in suffixes]
        counts = df[columns]
        #keep the words that have a non-zero count in at least one column
        counts = counts[(counts != 0).any(axis=1)]

        if groups and not entire:
            #two rows: the summed free countries and the summed non-free countries
            table = [
                (counts[label + '_ned'] + counts[label + '_can']).tolist(),
                (counts[label + '_rus'] + counts[label + '_sau']).tolist(),
            ]
        else:
            #one row per compared column, one entry per word
            table = [counts[column].tolist() for column in columns]

        print ('\n Chi squared results for label %s \n' % (label))
        chi2analysis(table)
                
def chi2analysis(table):
    """Run scipy's chi-squared contingency test on `table` and print the result.

    Prints the statistic, the critical value at the 0.95 quantile for the
    resulting degrees of freedom, the p-value and the degrees of freedom.
    """
    chi, p, dof, _ = chi2_contingency(table)

    report = (
        ('chisquared', chi),
        ('crit', chi2.ppf(0.95, dof)),
        ('p', p),
        ('dof', dof),
    )
    for name, value in report:
        print (name + '= ' + str(value))

def freenonfree_freq_clouds(country1,name1,country2,name2,how,columnname):
    """Combine two country frames, print the top 25 words and draw a word cloud.

    country1, country2: word-frequency dataframes; name1, name2: their short
    names, used as column suffixes. how: merge type passed to DataFrame.merge.
    columnname: name of the new column holding the summed totals; it is also
    used as mask name and column name for the word cloud.
    """
    suffix1 = '_' + name1
    suffix2 = '_' + name2

    merged = country1.merge(country2, how=how, left_index = True, right_index = True, suffixes=(suffix1, suffix2))
    merged = merged.fillna(0)
    merged[columnname] = merged['total' + suffix1] + merged['total' + suffix2]

    ranked = merged.sort_values(by=[columnname], ascending = False)
    print ('\n Most frequent 25 words for %s \n' % (columnname))
    print (ranked.head(25))

    wordcloud(ranked, columnname, columnname)

def unique_words(df):
    """Print and plot the words that occur in only one of the two groups.

    df needs the columns total_ned, total_can, total_rus and total_sau and is
    indexed by word. The columns 'free' and 'nonfree' are added to it.
    Words containing '/' are left out of both selections.
    """
    df['free'] = df['total_ned'] + df['total_can']
    df['nonfree'] =  df['total_rus'] + df['total_sau']

    #one flag per row: True when the word contains no '/'
    without_slash = np.array(['/' not in word for word in df.index], dtype=bool)
    #unique to a group = the other group never used the word
    only_free = df[without_slash & (df['nonfree'] == 0).to_numpy()]
    only_nonfree = df[without_slash & (df['free'] == 0).to_numpy()]

    only_free = only_free.sort_values(by=['free'], ascending = False)
    only_nonfree = only_nonfree.sort_values(by=['nonfree'], ascending = False)

    print ('\n Most frequent 25 unique words for the free countries \n')
    print (only_free.head(25))
    wordcloud(only_free, 'free', 'free')
    print ('\n Most frequent 25 unique words for the non-free countries \n')
    print (only_nonfree.head(25))
    wordcloud(only_nonfree, 'nonfree', 'nonfree')

In [None]:
#run the word-frequency pipeline on the four raw tweet files
#(argument order: Netherlands, Canada, Saudi Arabia, Russia)
word_frequency_analysis(
    'tweets-netherlands-2020-03-14_19_59_49',
    'tweets-canada-2020-03-14_19_36_20',
    'tweets-saudiarabia-2020-03-14_20_54_54',
    'tweets-russia-2020-03-14_16_37_32',
)