# 0 - Setup

In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
from tidytext import unnest_tokens

import os
import gn_modules.store as gn_store
import gn_modules.secure_dotenv as gn_dotenv

gn_dotenv.load_dotenv_secure()

In [2]:
NAME_FILE = 'prenoms_clean.csv'

# 1 - get data from the db

In [3]:
def read_mongo(query={}, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    cursor = gn_store.connect_db().find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [4]:
# Every article after the 1st of january 2021
df = read_mongo(query= { 'date' : { '$gte' : '2021-02-01'}})
df.head()

Unnamed: 0,link,date,title,text,source,authors,categories,sub_categories,keywords,word_count,gender_count,masculinity_rate,names
0,http://laroche.fr/,2021-02-16,Souhaiter planche mort davantage prince part.,Cercle acte port hasard. Professeur un clef po...,"[rejeter, instant]","[Monique Voisin, Lucy Daniel]",Culture,cerveau,"[sauter, apprendre, président]",4129,"{'male': 15, 'female': 8, 'unknown': 4}",0.924862,"[{'name': 'Paul Ollivier', 'gender': 'male', '..."
1,http://www.gillet.net/,2021-02-01,Île oeuvre oui fortune préférer.,Chaque contenter simplement temps eh premier p...,"[serrer, as]","[Luc-Théophile Bouchet, Élodie Boulanger-Duhamel]",International,foi,"[rejoindre, chien, silencieux]",3079,"{'male': 15, 'female': 9, 'unknown': 4}",0.435256,"[{'name': 'Jacques Marchand', 'gender': 'male'..."
2,http://faure.org/,2021-02-07,Doute autre est peine disposer sous.,Or premier tromper choix regard. Voisin assure...,"[haïr, mensonge]","[Thierry Lenoir, Marthe Le Renard]",International,règle,"[impossible, robe, action]",4105,"{'male': 13, 'female': 6, 'unknown': 2}",0.661014,"[{'name': 'Victor Tessier', 'gender': 'male', ..."
3,https://www.grondin.com/,2021-02-05,Malgré rouge devenir société.,Conversation toute chute moyen sou pleurer con...,"[compte, rassurer]","[Bernard Richard, Victoire Samson du Muller]",International,armée,"[mode, vieil, absence]",3578,"{'male': 14, 'female': 5, 'unknown': 2}",0.346604,"[{'name': 'Victor Fernandez', 'gender': 'male'..."
4,https://www.sauvage.com/,2021-02-07,Remplir témoin sujet mode haïr proposer.,Promener voile nez fauteuil étranger éclat sub...,"[abri, tout]","[Tristan de la Ramos, Margaret Le Gall-Benoit]",International,détail,"[sourd, prouver, demander]",3498,"{'male': 13, 'female': 9, 'unknown': 4}",0.237862,"[{'name': 'Charles Riou', 'gender': 'male', 'o..."


# 2 - Get the names df

In [5]:
names_df = pd.read_csv(NAME_FILE, sep=';')
names_df = names_df.rename(columns={'preusuel': 'word'})
names_df.head()

Unnamed: 0,word,sexratio_prenom,n,sexe_prenom
0,aadam,1.0,24,Homme
1,aadel,1.0,55,Homme
2,aadil,1.0,177,Homme
3,aakash,1.0,25,Homme
4,aalia,0.0,26,Femme


# 3 - Compute masculinity score

In [6]:
txt_tokens = unnest_tokens(df[['text']], 'word', 'text')
txt_tokens['id'] = txt_tokens.index
txt_tokens

Unnamed: 0,word,id
0,cercle,0
0,acte,0
0,port,0
0,hasard,0
0,professeur,0
...,...,...
4679,rencontre,4679
4679,souhaiter,4679
4679,printemps,4679
4679,comme,4679


In [7]:
txt_tokens_with_name = pd.merge(txt_tokens, names_df, how='left')
txt_tokens_with_name

Unnamed: 0,word,id,sexratio_prenom,n,sexe_prenom
0,cercle,0,,,
1,acte,0,,,
2,port,0,,,
3,hasard,0,,,
4,professeur,0,,,
...,...,...,...,...,...
3183025,rencontre,4679,,,
3183026,souhaiter,4679,,,
3183027,printemps,4679,,,
3183028,comme,4679,,,


In [8]:
matches = txt_tokens_with_name.dropna()
matches = matches.drop(columns = ['n'])
matches

Unnamed: 0,word,id,sexratio_prenom,sexe_prenom
707,françois,1,0.999358,Homme
1269,claire,1,0.001380,Femme
1425,claire,2,0.001380,Femme
2625,françois,3,0.999358,Homme
2858,pierre,4,0.998965,Homme
...,...,...,...,...
3180952,pierre,4676,0.998965,Homme
3181231,pierre,4677,0.998965,Homme
3182025,françois,4678,0.999358,Homme
3182373,claire,4679,0.001380,Femme


In [9]:
m_rate = txt_tokens_with_name.groupby('id')['sexratio_prenom'].mean()
m_rate

id
0            NaN
1       0.500369
2       0.001380
3       0.999358
4       0.500172
          ...   
4675    0.001380
4676    0.999161
4677    0.998965
4678    0.999358
4679    0.001380
Name: sexratio_prenom, Length: 4680, dtype: float64

In [10]:
df['masculinity_rate'] = m_rate
df.head()

Unnamed: 0,link,date,title,text,source,authors,categories,sub_categories,keywords,word_count,gender_count,masculinity_rate,names
0,http://laroche.fr/,2021-02-16,Souhaiter planche mort davantage prince part.,Cercle acte port hasard. Professeur un clef po...,"[rejeter, instant]","[Monique Voisin, Lucy Daniel]",Culture,cerveau,"[sauter, apprendre, président]",4129,"{'male': 15, 'female': 8, 'unknown': 4}",,"[{'name': 'Paul Ollivier', 'gender': 'male', '..."
1,http://www.gillet.net/,2021-02-01,Île oeuvre oui fortune préférer.,Chaque contenter simplement temps eh premier p...,"[serrer, as]","[Luc-Théophile Bouchet, Élodie Boulanger-Duhamel]",International,foi,"[rejoindre, chien, silencieux]",3079,"{'male': 15, 'female': 9, 'unknown': 4}",0.500369,"[{'name': 'Jacques Marchand', 'gender': 'male'..."
2,http://faure.org/,2021-02-07,Doute autre est peine disposer sous.,Or premier tromper choix regard. Voisin assure...,"[haïr, mensonge]","[Thierry Lenoir, Marthe Le Renard]",International,règle,"[impossible, robe, action]",4105,"{'male': 13, 'female': 6, 'unknown': 2}",0.00138,"[{'name': 'Victor Tessier', 'gender': 'male', ..."
3,https://www.grondin.com/,2021-02-05,Malgré rouge devenir société.,Conversation toute chute moyen sou pleurer con...,"[compte, rassurer]","[Bernard Richard, Victoire Samson du Muller]",International,armée,"[mode, vieil, absence]",3578,"{'male': 14, 'female': 5, 'unknown': 2}",0.999358,"[{'name': 'Victor Fernandez', 'gender': 'male'..."
4,https://www.sauvage.com/,2021-02-07,Remplir témoin sujet mode haïr proposer.,Promener voile nez fauteuil étranger éclat sub...,"[abri, tout]","[Tristan de la Ramos, Margaret Le Gall-Benoit]",International,détail,"[sourd, prouver, demander]",3498,"{'male': 13, 'female': 9, 'unknown': 4}",0.500172,"[{'name': 'Charles Riou', 'gender': 'male', 'o..."


# 4 - Retrieve extracted names (wip)

In [11]:
names = matches.groupby(['id', 'word'])['word'].count()

In [12]:
names.reset_index(col_fill = 'count')

TypeError: reset_index() got an unexpected keyword argument 'col_fill'