# Initial methods given 

### Example of how to work with the quotes data file

In [47]:
# If working on colab :
from google.colab import drive
drive.mount('/content/drive') 

In [2]:
import bz2
import json
import pandas as pd

In [13]:
from tld import get_tld

def get_domain(url):
    res = get_tld(url, as_object=True)
    return res.tld

In [None]:
path_to_file = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2' 
path_to_out = '/content/quotes-2020-domains.json.bz2'

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            urls = instance['urls'] # extracting list of links
            domains = []
            for url in urls:
                tld = get_domain(url)
                domains.append(tld)
            instance['domains'] = domains # updating the sample with domain name
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

### Load the speaker_attributes parquet file

In [7]:
speaker_attributes = pd.read_parquet('speaker_attributes.parquet')

In [8]:
speaker_attributes.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
0,"[Washington, President Washington, G. Washingt...",[+1732-02-22T00:00:00Z],"[Q161885, Q30]",[Q6581097],1395141751,,W000178,"[Q82955, Q189290, Q131512, Q1734662, Q294126, ...",[Q327591],,Q23,George Washington,"[Q698073, Q697949]",item,[Q682443]
1,"[Douglas Noel Adams, Douglas Noël Adams, Dougl...",[+1952-03-11T00:00:00Z],[Q145],[Q6581097],1395737157,[Q7994501],,"[Q214917, Q28389, Q6625963, Q4853732, Q1884422...",,,Q42,Douglas Adams,,item,
2,"[Paul Marie Ghislain Otlet, Paul Marie Otlet]",[+1868-08-23T00:00:00Z],[Q31],[Q6581097],1380367296,,,"[Q36180, Q40348, Q182436, Q1265807, Q205375, Q...",,,Q1868,Paul Otlet,,item,
3,"[George Walker Bush, Bush Jr., Dubya, GWB, Bus...",[+1946-07-06T00:00:00Z],[Q30],[Q6581097],1395142029,,,"[Q82955, Q15982858, Q18814623, Q1028181, Q1408...",[Q29468],,Q207,George W. Bush,"[Q327959, Q464075, Q3586276, Q4450587]",item,"[Q329646, Q682443, Q33203]"
4,"[Velázquez, Diego Rodríguez de Silva y Velázqu...",[+1599-06-06T00:00:00Z],[Q29],[Q6581097],1391704596,,,[Q1028181],,,Q297,Diego Velázquez,,item,


### Load the labels descriptions for quotebank

In [9]:
label_description = pd.read_csv('wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

In [10]:
label_description.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q31,Belgium,country in western Europe
Q45,Portugal,country in southwestern Europe
Q75,Internet,global system of connected computer networks
Q148,People's Republic of China,sovereign state in East Asia
Q155,Brazil,country in South America


# Dealing with the features

For the questions we want to use, we don't need all the features. We decided to keep the following ones:


*   **date**: To be able to work with them.
*   **numOccurences**: To take out the duplicates.
*   **qids**: The speakers' id, necessary to connect with the speaker_attributes.
*   **quotation**: To be able to assess its positivity.
*   **quoteID**: To identify them.
*   **speaker**: To access their attributes and see if there's multiple qids for the same speaker.

For the 3 remaining features, we explain why we choosed to take them out:


#### phase

As we see on the [description of the phase](https://github.com/epfl-dlab/Quotebank/blob/main/phases.md). All of quotations we will use (2015-2020) are from phase E as they are past June 2014. We check below:

In [24]:
count_E = 0
count_quote = 0
quotes = bz2.open('quotes-2020.json.bz2', 'rb')

for instance in quotes:
    quote = json.loads(instance) # loading a sample
    phase = quote['phase'] # extracting list of links
    if phase == 'E':
        count_E += 1
    count_quote+=1

In [25]:
print(f'We have {count_E*100/count_quote}% of quotes in phase E for 2020')

We have 100.0% of quotes in phase E for 2020


As expected, we don't have any other phase than phase 'E' in the dataset from 2020. We shouldn't have them for 2015-2019. With the test on 2020 and the description's informations, we decided that it safe to drop this feature.

#### probas

We only keep the main speaker, therefore the probabilities and name of the other speakers are not useful for us. We chose to rely on the decisions made by QuoteBanks in choosing the speaker, and only keep the main(s) one(s).

#### Urls

In our research, we don't specificaly look into the source of the quote, expect for the speaker. Therefore, we decided to drop the urls feature as it is irrevelant to our research.

### Dropping the unwanted features

We drop the unwanted features as explained before. We do this by keeping 

In [28]:
def drop_features(path_to_file, path_to_out, max_instances=-1):
    count = 0
    with bz2.open(path_to_file, 'rb') as s_file:
        with bz2.open(path_to_out, 'wb') as d_file:
            for instance in s_file:
                if count == max_instances:
                    return
                instance = json.loads(instance)  # loading a sample
                series = pd.Series(instance)
                instance = series[[
                    'quoteID',
                    'date',
                    'numOccurrences',
                    'qids',
                    'quotation',
                    'speaker',
                    ]].to_dict()
                d_file.write((json.dumps(instance) + '\n').encode('utf-8'))  # writing in the new file
                count += 1

We try it with the 100 first instances:

In [29]:
path_to_file = 'quotes-2020.json.bz2'
path_to_out = 'quotes-2020-dropped-features.json.bz2'

drop_features(path_to_file, path_to_out, 100)

dropped_features_df = pd.read_json(path_to_out, compression='bz2', lines=True)
dropped_features_df.head()

Unnamed: 0,quoteID,date,numOccurrences,qids,quotation,speaker
0,2020-01-28-000082,2020-01-28 08:04:05,1,[],[ D ] espite the efforts of the partners to cr...,
1,2020-01-16-000088,2020-01-16 12:00:13,1,[Q367796],[ Department of Homeland Security ] was livid ...,Sue Myrick
2,2020-02-10-000142,2020-02-10 23:45:54,1,[],... He (Madhav) also disclosed that the illega...,
3,2020-02-15-000053,2020-02-15 14:12:51,2,[],"... [ I ] f it gets to the floor,",
4,2020-01-24-000168,2020-01-24 20:37:09,4,[Q20684375],[ I met them ] when they just turned 4 and 7. ...,Meghan King Edmonds


### Special case: when we need to have the speaker's information

For two of our questions, see the [readMe.md](https://), we want to access the speaker's attributes. If we don't have any informations concerning them, the quote becomes unseful for our research. 

In [43]:
quotes = bz2.open('quotes-2020.json.bz2', 'rb')
count_none = 0
for instance in quotes:
    quote = json.loads(instance)  # loading a sample
    speaker = quote['speaker']  # extracting list of links
    if speaker == 'None':
        count_none += 1
print(count_none)

1800844


In [44]:
print(f'We have {count_none*100/count_quote}% of None ')

We have 34.338097291059555% of None 


As we see, on the quotebank data of 2020 we already have ~34% of missing speaker, labelled as "None". Therefore, we remove them from all the informations and write them in a new Json file.

Why do we remove them? Because if we look at the exemple below, 

In [26]:
## Load a quote

We see that, it is ~73% sure of not knowing who is the right speaker. We could take the as speaker the 

```
['Prime Minister Netanyahu', '0.2445']
```
We have more chance of it being false as it has only ~24% of chances of being him. We prefer to take only the speakers with an high chance of being right. We do it in the below method:


In [27]:
def drop_unknown_speaker(path_to_file, path_to_out, max_instances=-1):
    count = 0
    with bz2.open(path_to_file, 'rb') as s_file:
        with bz2.open(path_to_out, 'wb') as d_file:
            for instance in s_file:
                if count == max_instances:
                    return
                instance = json.loads(instance)  # loading a sample
                speaker = instance['speaker']
                if speaker != 'None':
                    d_file.write((json.dumps(instance) + '\n').encode('utf-8'))  # writing in the new file
                count += 1

We check it with 100 000 instances (to fasten the test part):

In [None]:
path_to_file = 'quotes-2020.json.bz2'
path_to_out = 'quotes-2020-dropped_unknown_speaker.json.bz2'

drop_unknown_speaker(path_to_file, path_to_out, 100000)

dropped_unknown_speaker_df = pd.read_json(path_to_out, compression='bz2', lines=True)

print(dropped_unknown_speaker_df.loc[dropped_unknown_speaker_df.speaker == 'None', 'speaker'].count())

As we see, we can safely see that we take out all the 'None' speaker from the data.

# Data analysis

## Dates

As we are interested in dates, we want to understand their distributions.
As the dates are in the format 'yyyy-mm-dd hh:mm:ss', we check that all the dates are well in 2020 by checking the first 4 characters. Doing so helps us to check if there's any date which is not correctly written or missing:

In [None]:
quotes = bz2.open('quotes-2020.json.bz2', 'rb')
for instance in quotes:
    quote = json.loads(instance)  # loading a sample
    date = instance['date']  # extracting the date
    if date[:4] == '2020':
        count_2020 += 1

In [None]:
print(f'There is {count_2020*100/count_quote}% of date starting with 2020')

So it is as guessed.

To see the distribution of quotes per day of the year, we define a function to change the date format 'yyyy-mm-dd' into an index of a table. To do so we use the library date time.

In [45]:
from datetime import datetime

def get_day_of_year(date):
    date_time_obj = datetime.strptime(date[2:], '%y-%m-%d %H:%M:%S')
    d = date_time_obj.strftime('%-j')
    return d

In [None]:
print(get_day_of_year("2020-12-30 12:26:24"))

Now we can look at the distribution of the dates:

In [46]:
def count_per_day(path_to_file, max_instances=-1):
    count_days = np.zeros(366)
    count = 0
    with bz2.open(path_to_file, 'rb') as s_file:
        for instance in s_file:
            if count == max_instances:
                break
            instance = json.loads(instance)  # loading a sample
            date = instance['date']  # extracting the date
            idx = int(get_day_of_year(date)) - 1
            count_days[idx] += 1
            count += 1
        return count_days

We check for the different years with a sample of 100000 quotes to have a proper representation.

In [None]:
years = ['2015','2016','2017','2018','2019','2020']
day = np.arange(366)

(fig, axes) = plt.subplots(6, constrained_layout=True, figsize=(15, 15))
fig.suptitle('Count of quotes per day of the year', fontsize=16)

for (year, ax) in zip(years, axes):
    path_to_file = 'quotes-{year}.json.bz2'.format(year=year)
    count_days = count_per_day(path_to_file, max_instances=100000)
    ax.plot(day, count_days)
    ax.set_title(year)
    ax.set_xlabel('day of the year')
    ax.set_ylabel('count')

plt.show()

Everytime we look at the dates, we have to be careful about the number of quotations per day. It could lead to wrong results if we want to look at the distribution of a type of quote over time. We see an evident weekly tendency and the quotes for the year 2020 stop at ~110 days. There's also some seemingly unexplicable drop in quotes.For example, the quotes from year 2016 do some strange time where the data is missing.

## Speakers

In [48]:
speakersID = set()
quotes = bz2.open('quotes-2020.json.bz2', 'rb')
for q in quotes:
    quote = json.loads(q)
    speakers = quote['qids']
    for speakerID in speakers:
        speakersID.add(speakerID)

In [49]:
len(speakersID)

323074

# Preparing the data to pair speakers and their occupations

First let's create an array of all the different IDs for occupation and profession and removing duplicates and None values

In [52]:
work = set()
for w in speaker_attributes['occupation'].dropna():
    for i in w:
        work.add(i)
work = list(work)
len(work)

Now we will create a dictionnary which will have the QID as keys and the name and description of the job as values.

In [56]:
dwork = {}
for i in work:
    if i in label_description.index:
        dwork[i]=[label_description.loc[i]['Label'],label_description.loc[i]['Description']]
len(dwork)

As some QID had nan values we decided to count them to see how many there were and then we removed them as we couldn't use them.

In [58]:
summ=0
for i in dwork.values():
    if isinstance(i[0], float):
        summ+=1
        
print("There are",summ,"nan values.")

2460


In [59]:
dic_work={}
for i in dwork:
    if not isinstance(dwork[i], float):
        dic_work[i]=dwork[i]

len(dic_work)

12098

In [None]:
dic_work['Q162555']

Now we have a new dataset that will help us regroup speaker regarding to their jobs

In [60]:
df_work = pd.DataFrame(dic_work.items(), columns=['ID', 'Label'])
df_work[['Label','Description']] = pd.DataFrame(df_work.Label.tolist(), index= df_work.index)
df_work.head()

Storing the new dataset in a parquet file

In [None]:
df_work.to_parquet('data/Id_Work.parquet')

In [3]:
df = pd.read_parquet('data/Id_Work.parquet')
df.head()

Unnamed: 0,ID,Label
0,Q181217,Bey
1,Q11087077,Periodistas
2,Q10853156,alarife
3,Q678003,driving instructor
4,Q13141064,badminton player


### Clustering

We want to regroup the different occupations that are similar together, for example all the job realted to finance together, the one related to military together, medecine, law, etc...
As there are thousands of different occupations it can't reasonably be done manually so we are trying to group them using NLP and clustering. We don't have a proper model yet but we can already see that with Kmeans we already got clusters that make sens. There still are some problem like one cluster having too many unrelated value, which can be fixed by increasing the number of cluster as it acts like a "trash" for all the value that couldn't be clustered with others. But we also see that in the same cluster we would have for example dentist and oral surgeon but then also oral story telling, so we see that there still are flaws in our model. We will keep experimenting in order to get a realy good clustering model.

In [None]:
import numpy as np

from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
df = df.dropna()
df.head()

In [None]:
texts = df.Description # Extract text
target = df.Label # Extract target

Instead of just using the description we added the name of the occupation with it so that job with similar name would have a higher chance to be together

In [None]:
job_desc = target.str.cat(texts, sep=' , ')
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(job_desc)

In [None]:
number_of_clusters = 200

model = KMeans(n_clusters=number_of_clusters, 
               init='k-means++', 
               max_iter=100, # Maximum number of iterations of the k-means algorithm for a single run.
               n_init=1)  # Number of time the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia.

model.fit(X)

In [None]:
lab = model.labels_
df_cluster = df.copy()
df_cluster['cluster'] = lab
df_cluster.head()

In [None]:
df_cluster[df_cluster['cluster']==5].head()

In [None]:
for i in range(number_of_clusters):
    print("cluster",i)
    print(len(df_cluster[df_cluster['cluster']==i]))

# First approach of the sentiment analysis 

## Polarity scores of quotes using Vader

In [66]:
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

path_to_file = 'quotes-2020.json.bz2' 
path_to_out = 'quotes-2020-polarity-scores.json.bz2'

i = 0
with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance)
            quote = instance['quotation']
            scores = sid.polarity_scores(quote)
            #instance['polarity_scores'] = scores
            print(scores)
            i+=1
            if(i==10):
                break
            # d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

{'neg': 0.0, 'neu': 0.79, 'pos': 0.21, 'compound': 0.872}
{'neg': 0.167, 'neu': 0.619, 'pos': 0.214, 'compound': 0.0}
{'neg': 0.113, 'neu': 0.84, 'pos': 0.046, 'compound': -0.4939}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 0.899, 'pos': 0.101, 'compound': 0.3041}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.038, 'neu': 0.888, 'pos': 0.074, 'compound': 0.3818}
{'neg': 0.121, 'neu': 0.715, 'pos': 0.165, 'compound': 0.2484}
{'neg': 0.0, 'neu': 0.691, 'pos': 0.309, 'compound': 0.8277}
