In [None]:
'''
File name: titles_analysis.ipynb
Author: Camille Bernelin
Date created: 21/12/2022
Date last modified: 23/12/2022
Python Version: 3.9.13
'''

Global parameters to set for data analysis :

In [2]:
##### RUN FOR PERIOD :

year_start = "2018"
month_start = "12"
year_end = "2019"
month_end = "01"

##### FOCUS ON A WORD

word = ''

##### HOW MANY COMMUNITIES ?

nb_communities = 6

Setting the processing environment :

In [17]:
import pandas as pd
import numpy as np
import spacy
import codecs


import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import math
import os
import glob


# Word occurences count
from collections import Counter

# Sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Dynamic graphs
import plotly.graph_objects as go
from plotly.colors import n_colors
import plotly.express as px

# Emojis management
def is_emoji(s):
    emojis = "😘◼️🔴🤾🎅😂🚒👨🤦" # add more emojis here
    count = 0
    for emoji in emojis:
        count += s.count(emoji)
        if count > 1:
            return False
    return bool(count)

DIR = "data/"
DIR_LARGE = "data/large/"
DIR_OUT = "csv_outputs/"
PATH_METADATA = DIR_LARGE + "yt_metadata_en.jsonl.gz"


############# Data used for processing of titles and tags


communities = range(nb_communities)

nlp = spacy.load('en_core_web_sm')
nlp.max_length=1000000

# Preprocessing
#undesired_expression_list = ["Fox News", "New York Times","NBC News"]
undesired_expression_list = []

# Filtering
start_date = year_start + "-" + month_start + "-01"
period_start = pd.to_datetime(start_date, format='%Y-%m-%d')
end_date = year_end + "-" + month_end + "-31"
period_end =pd.to_datetime(end_date, format='%Y-%m-%d')

# Processing
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
#news_lexical_field = ['news','News','wrap','Day','day','TV','Channel','channel']
news_lexical_field = ['Reports','Report','latest','Latest','Headlines','headlines','Forecast','forecast','Ep','EP','MSNBC','ITV','Breaking','News','news','man','Man','woman','Woman','Men','men','Women','women','Day','day','say','Say','says','Says','SAYS','New','new','year','Year','Call','call','LIVE','Live','live','Video','video','December','January']
my_undesired_list = ['|','l','w/','=','$',word] + news_lexical_field

named_entities = ['LOC','GPE']

Apply the natural language processing to titles, to count the occurences of words, once titles have been cleaned (stopwords removal, elimination of classical journalistic vocabulaty carrying few information, elimination of emojis...).

In [15]:
for selected_commu in communities:
    ############# Select videos whom title contains a given word

    titles = pd.read_csv(DIR_OUT + "titles_date"+str(selected_commu)+".csv",sep=";")

    titles_contain = titles[titles['title'].str.contains(word, case=False)]


    ############# Preprocess the text

    for expression in undesired_expression_list :
        titles_contain.loc['title'] = titles_contain['title'].str.replace(expression,"")

    titles_contain.to_csv(DIR_OUT + "titles_date_f.csv",sep=";",index=False, header=True)
    #titles_contain.to_csv(DIR_OUT+"titles.txt", sep="\n",index=False, header=False)

    ############# Select videos in a time period of interest

    titles = pd.read_csv(DIR_OUT + "titles_date_f.csv",sep=";")
    titles["upload_date"] = pd.to_datetime(titles["upload_date"], format='%Y-%m-%d %H:%M:%S')
    titles = titles[(titles['upload_date']>=period_start) & (titles['upload_date']<=period_end)]

    #titles['title'] = [x.encode('utf-8','ignore').decode("utf-8") for x in titles['title']]
    titles["title"].to_csv(DIR_OUT+"titles_to_process.txt", sep="\n",index=False, header=False)

    ############# Process the titles 

    books = list()

    with codecs.open(os.path.join(DIR_OUT,"titles_to_process.txt"), encoding='utf8') as f:
        books.append(f.read())

    # Remove new lines
    books = [" ".join(b.split()) for b in books]

    doc_processed = []

    # Processing of titles
    for i in range(0,int(len(books[0])/nlp.max_length)+1):

        # Tokenization
        print("Tokenization")
        doc = nlp(books[0][i*nlp.max_length:(i+1)*nlp.max_length])

        # Named entities recognition
        print("Named entities recognition")
        for ent in doc.ents:
            if ent.label_ in named_entities:
                doc_processed = doc_processed + [ent.text]
            
        # Punctuation and stopwords removal
        print("Punctuation and stopwords removal")
        doc_processed = doc_processed + [token.text for token in doc if (not (token.ent_type_ in named_entities) and not token.is_digit and not token.is_stop and not token.is_punct and not (token.text in my_undesired_list) and not is_emoji(token.text))]

        # Removal of undesired characters
        doc_processed = [token.replace(',','') for token in doc_processed]
        doc_processed = [token.replace('.','') for token in doc_processed]
        doc_processed = [token.replace("'s",'') for token in doc_processed]

        print("Output")
        #doc_processed = [x.encode('latin1','ignore').decode("latin1") for x in doc_processed]
        pd.DataFrame(doc_processed).to_csv(DIR_OUT+"titles_words.csv", sep=";",index=False, header=['word'])

    # EXEC = ~2min30 per iteration (Tokenization takes 90% of exec time)
    
    ############# Count words occurences

    PATH_OUT = DIR_OUT+"communities_comparison/titles_occurences_"+str(selected_commu)+"_"+word+"_"+start_date+"_"+end_date+".csv"

    titles_processed = pd.read_csv(DIR_OUT+"titles_words.csv", sep=';')
    #titles_processed = [x.encode('utf-8','ignore').decode("utf-8") for x in titles_processed['word']]

    # Count occurences
    titles_processed_lowercase = [str(word).lower() for word in titles_processed['word']]
    word_freq = Counter(titles_processed_lowercase)
    common_words = word_freq.most_common()
    common_words_out = pd.DataFrame(common_words)
    common_words_out.columns=['word','occurences']

    common_words_out.insert(2, column='frequency', value=common_words_out['occurences']/common_words_out['occurences'].sum())

    display("Community " + str(selected_commu))
    common_words_out.to_csv(PATH_OUT,sep=';')
    display(common_words_out.head(20))


'Community 0'

Unnamed: 0,word,occurences,frequency
0,police,3384,0.007006
1,pm,2351,0.004867
2,fire,2124,0.004397
3,trump,2063,0.004271
4,weather,1945,0.004027


'Community 1'

Unnamed: 0,word,occurences,frequency
0,trump,2150,0.033171
1,president,643,0.009921
2,shutdown,421,0.006495
3,us,404,0.006233
4,donald,365,0.005631


'Community 2'

Unnamed: 0,word,occurences,frequency
0,brexit,1808,0.025692
1,deal,580,0.008242
2,eu,544,0.00773
3,trump,541,0.007688
4,theresa,461,0.006551


'Community 3'

Unnamed: 0,word,occurences,frequency
0,trump,1492,0.020299
1,mueller,651,0.008857
2,president,411,0.005592
3,announcement,333,0.004531
4,q,305,0.00415


'Community 4'

Unnamed: 0,word,occurences,frequency
0,trump,83,0.006947
1,david,64,0.005357
2,media,55,0.004604
3,menzies,54,0.00452
4,patreon,51,0.004269


'Community 5'

Unnamed: 0,word,occurences,frequency
0,trump,1087,0.024262
1,president,483,0.010781
2,border,465,0.010379
3,wall,348,0.007767
4,pelosi,257,0.005736


Let's have a look a the most cited words for every community :

In [27]:
# Read previously computed files

data = pd.DataFrame()
for community in communities:
    temp = pd.read_csv(DIR_OUT+"communities_comparison/titles_occurences_"+str(community)+"_"+word+"_"+start_date+"_"+end_date+".csv", sep=';',index_col=0)
    myarray = np.empty(len(temp), dtype=int)
    myarray.fill(community)
    temp.insert(0,column='community',value=myarray)
    temp = temp.sort_values(by='frequency',ascending=True)
    data = pd.concat([data, temp[len(temp)-60:len(temp)]])

display(data)

# Construct histograms

fig = px.histogram(data,
                   x="frequency",
                   y="word",
                   title='Histogram of polarization scores',
                   animation_frame="community",
                   opacity=0.8,
                   range_x=[0,0.035],
                   orientation='h')

fig["layout"].pop("updatemenus")
fig.write_html("figures/60_most_cited_topics.html")
fig.show()

Unnamed: 0,community,word,occurences,frequency
59,0,hw,643,0.001331
58,0,team,647,0.001339
57,0,people,648,0.001342
56,0,us,663,0.001373
55,0,child,665,0.001377
...,...,...,...,...
4,5,pelosi,257,0.005736
3,5,wall,348,0.007767
2,5,border,465,0.010379
1,5,president,483,0.010781


The analysis of occurences of words in words ensembles is now performed.

In [30]:
# Select the words ensemble to consider :

nb_list = 1

trump_list = ['maga','security','trump','wall','evangelist']
national_list = ['shutdown','wall','security','shooting','pelosi','immigration','maga']
surnatural_list = ['ufo','prophecy','christ','truth','spirit','pray','prophet']
conspi_list = ['satanic','fake','truth','alien','lie','moon','9/11','pedophile']
international_list = ['brexit','eu','china','europe','macron','boris','merkel','yemen','asia','africa','india','modi','morrison']

# Filtering

result = []
result_df = pd.DataFrame()

list_of_lists = [trump_list, national_list, surnatural_list, conspi_list, international_list]
list_of_lists_name = ['trump', 'national politics', 'religion and beliefs', 'conspirationist', 'international']
list_name = list_of_lists_name[nb_list]
targets_list = list_of_lists[nb_list]

for j in range(len(targets_list)):
    temp = pd.DataFrame(columns = ['community','rank_title','rank_tag','title_frequency'])
    target = targets_list[j]
    use_tags = False

    for i in range(0,6):
        titles=pd.read_csv(DIR_OUT+"communities_comparison/titles_occurences_"+str(i)+"_"+word+"_"+start_date+"_"+end_date+".csv", sep=';')
        if use_tags :
            if (titles['word']==target).any() or (tags['word']==target).any():
                tags=pd.read_csv(DIR_OUT+"communities_comparison/tags_occurences_"+str(i)+"_"+word+"_"+start_date+"_"+end_date+".csv", sep=';')
                temp.loc[len(temp.index)] = [i, titles[titles['word']==target].index.values[0], tags[tags['word']==word].index.values[0], float(titles[titles['word']==target]['frequency'])]
            else:
                display(target + " is not in community " + str(i))
                temp.loc[len(temp.index)] = [i,math.inf, '-', 0]
        else:
            if (titles['word']==target).any():
                temp.loc[len(temp.index)] = [i, titles[titles['word']==target].index.values[0], '-', float(titles[titles['word']==target]['frequency'])]
            else:
                display(target + " is not in community " + str(i))
                temp.loc[len(temp.index)] = [i,math.inf, '-', 0]

    result = result + [temp]

############ GRAPH : Words occurences

data = pd.DataFrame()
for i in range(0,len(targets_list)):
    data.insert(i, column=targets_list[i], value=result[i]['title_frequency'])

#display(data)

fig = px.line(data, title='Words occurences in different communities : '+list_name+' topics',labels={'value':'frequency','index':'community','variable':'Topics :'},width=600, height=400)
not_active_traces = [targets_list[i] for i in range(len(targets_list)) if i>len(targets_list)*1]
fig.for_each_trace(lambda trace: trace.update(visible='legendonly') if trace.name in not_active_traces else ())

list_of_lists_name = [word.replace(' ','_') for word in list_of_lists_name]
list_name = list_of_lists_name[nb_list]

fig.show()
fig.write_html("figures/words_occurences_"+list_name+".html")


For the selected word ensemble, we can compute a closeness matrix, plotting the distance in occurences ranking of words in the ensemble. The lower the difference, the closer the rankings of words in corresponding communities.

In [38]:
# Build the matrix given the previously run section, with given word ensemble

closeness_matrix = pd.DataFrame(index=communities,columns=communities)

for x in communities:
    for y in communities:
        for target in range(len(targets_list)):
            a = result[target][result[target]['community']==x]['title_frequency']
            b = result[target][result[target]['community']==y]['title_frequency']
            closeness_matrix[x][y]=int(abs(float(a)-float(b))*100000)

# Display the matrix

colors = n_colors('rgb(0, 180, 0)', 'rgb(30, 30, 30)',closeness_matrix.max().max()+1, colortype='rgb')
a = np.stack(closeness_matrix[0].to_numpy()).astype(int)
b = np.stack(closeness_matrix[1].to_numpy()).astype(int)
c = np.stack(closeness_matrix[2].to_numpy()).astype(int)
d = np.stack(closeness_matrix[3].to_numpy()).astype(int)
e = np.stack(closeness_matrix[4].to_numpy()).astype(int)
f = np.stack(closeness_matrix[5].to_numpy()).astype(int)

fig = go.Figure(data=[go.Table(
  header=dict(
    values=['<b>0</b>','<b>1</b>','<b>2</b>','<b>3</b>','<b>4</b>','<b>5</b>'],
    line_color='white', fill_color='white',
    align='center',font=dict(color='black', size=12)
  ),
  cells=dict(
    values=[a,b,c,d,e,f],
    line_color=[np.array(colors)[a], np.array(colors)[b], np.array(colors)[c], np.array(colors)[d], np.array(colors)[e], np.array(colors)[f]],
    fill_color=[np.array(colors)[a], np.array(colors)[b], np.array(colors)[c], np.array(colors)[d], np.array(colors)[e], np.array(colors)[f]],
    align='center', font=dict(color='white', size=11)
    ))
])

fig.show()
fig.write_html("figures/closeness_matrix.html")

Comparison with external datasets is runned here. The code is the same for all datasets, just modify the input file and the output variables (`distance_loco` to `distance_black` or `distance_queer`).

In [22]:
# Select input file to process

dataset = "data/LOCO_small.txt"
#dataset = "data/News_black_voices_titles.txt"
#dataset = "data/News_queer_voices_titles.txt"

######## Exctracting titles in dataset

result = ""
text = ""
unfinished_title = False
nb_titles = 0
chunk = 1000000

with codecs.open(dataset, 'r', encoding = 'latin1') as file:

    #writing the header
    text = file.read(chunk)

    while text:

        if unfinished_title:
            result = result +  text[0:text.find('"txt"')-2]
            text = text[text.find('txt')+8:len(text)]
            unfinished_title = False

        while text.find('"title"') != -1:
            nb_titles = nb_titles + 1
            text = text[text.find('"title"')+9:len(text)]

            if text.find('"txt"') != -1:
                result = result +  text[0:text.find('"txt"')-2] + "\n"
                text = text[text.find('"txt"')+7:len(text)]
            else:
                unfinished_title = True

        text = file.read(chunk)

file.close()

with codecs.open("data/LOCO_titles.txt", 'w', encoding = 'latin1') as file:
    file.write(result)
file.close()

display(nb_titles)

######## Processing titles just as titles in our communities

loco = ""
doc_processed = []

with codecs.open("data/LOCO_titles.txt", 'r', encoding = 'latin1') as file:
    loco = file.read()
file.close()


display("Length of the book is " + str(len(loco)) + " characters, " + str(len(loco)/nlp.max_length) + " times the max length that can be processed at once.")

# Processing
for i in range(0,int(len(loco)/nlp.max_length)+1):
    display('ITERATION ' + str(i))

    # Tokenization
    print("Tokenization")
    doc = nlp(loco[i*nlp.max_length:(i+1)*nlp.max_length])

    # Named entities recognition
    print("Named entities recognition")
    for ent in doc.ents:
        if ent.label_ in named_entities:
            doc_processed = doc_processed + [ent.text]
        
    # Punctuation and stopwords removal
    print("Punctuation and stopwords removal")
    doc_processed = doc_processed + [token.text for token in doc if (not (token.ent_type_ in named_entities) and not token.is_digit and not token.is_stop and not token.is_punct and not (token.text in my_undesired_list) and not is_emoji(token.text))]

    # Removal of undesired characters
    doc_processed = [token.replace(',','') for token in doc_processed]
    doc_processed = [token.replace('.','') for token in doc_processed]
    doc_processed = [token.replace("'s",'') for token in doc_processed]

    doc_processed = [token_text for token_text in doc_processed if not token_text=='']

    print("Output")
    #doc_processed = [x.encode('latin1','ignore').decode("latin1") for x in doc_processed]
    pd.DataFrame(doc_processed).to_csv(DIR_OUT+"loco_processed.csv", sep=";",index=False, header=['word'], encoding='latin1')


###### Computing occurences count in dataset

loco_processed = pd.read_csv(DIR_OUT+"loco_processed.csv", sep=';', encoding='latin1')

display(loco_processed.head)

# Count occurences
loco_processed_lowercase = [str(word).lower() for word in loco_processed['word']]
word_freq = Counter(loco_processed_lowercase)
common_words = word_freq.most_common()
common_words_out = pd.DataFrame(common_words)
common_words_out.columns=['word','occurences']

common_words_out.insert(2, column='frequency', value=common_words_out['occurences']/common_words_out['occurences'].sum())
common_words_out = common_words_out.sort_values(by='frequency', ascending = False)[['word','frequency']]

common_words_out.to_csv(DIR_OUT+"loco_occurences.csv")
display(common_words_out.head(10))

###### Comparing word occurences in dataset and in our titles for a given community

loco_occurences = pd.read_csv(DIR_OUT+"loco_occurences.csv", index_col = 0)
distance_loco = []

# Merge with communities occurences counts
for selected_commu in communities:

    PATH_COMMU = DIR_OUT+"communities_comparison/titles_occurences_"+str(selected_commu)+"_"+word+"_"+start_date+"_"+end_date+".csv"
    titles_occurences = pd.read_csv(PATH_COMMU, sep=';',index_col=0,usecols=['word','frequency'])
    loco_occurences.columns=['word','frequency_loco']

    # Merge to identify common words
    merged = titles_occurences.merge(loco_occurences, on='word', how='outer').fillna(0)

    # Normalize both vectors, not to be influenced by the number of words in the community
    merged['frequency'] = merged['frequency'] / merged['frequency'].abs().max()
    merged['frequency_loco'] = merged['frequency_loco'] / merged['frequency_loco'].abs().max()

    # Apply a logarithmic scale
    merged.insert(1,column='log_frequency',value=np.log10(merged['frequency']+1))
    merged.insert(1,column='log_frequency_loco',value=np.log10(merged['frequency_loco']+1))

    # Compute a 2-norm distance
    merged.insert(1,column='distance',value=merged['log_frequency']-merged['log_frequency_loco'])
    merged.insert(1,column='squared_distance',value=merged['distance'].pow(2))

    distance_loco = distance_loco + [math.sqrt(merged['squared_distance'].sum())]

display(distance_loco)

fig = px.line(distance_loco, title='Distance between topics in a community and topics in conspiracist medias',labels={'index':'community','value':'distance'},width=650, height=400)
fig.update_traces(showlegend=False)
fig.show()

[1.6247752224104015,
 1.4918789637451164,
 1.5241416007834676,
 1.4562036735276687,
 1.7834681277147675,
 1.492001517866818]

Once all datasets have been analyzed, we can build the heatmap of distances.

In [25]:
# Normalization
distance_queer = [element/sum(distance_queer) for element in distance_queer]
distance_black = [element/sum(distance_black) for element in distance_black]
distance_loco = [element/sum(distance_loco) for element in distance_loco]
#distance_climate = [element/sum(distance_climate) for element in distance_climate]

fig = px.imshow([distance_loco,distance_black,distance_queer],
                title="Distance between selected datasets and titles of videos in our communities,<br>\t in terms of frequency of occurence of words",
                labels=dict(x="Community", y="Lexical field" , color="Distance"),
                x=['0','1','2','3','4','5'], 
                y=['Conspiracy','Black voices','Queer voices',],
                width=700, height=400)
fig.show()
fig.write_html("figures/heat_map_datasets.html")