### Combining PMI Scores and POS tagging
The results below are based on PMI scores calculated on a subsample from which stop words have been removed. The subsample contains an equal number of women's football and men's football texts, which avoids issues that may arise from an imbalance of data across the two classes. These results are then used as a robustness check.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
# Load the men's-football PMI/POS table (subsample, stop words removed).
# NOTE(review): absolute, machine-specific path; it only resolves on the author's machine.
# NOTE(review): the displayed frame has 'Unnamed: 0.1' / 'Unnamed: 0' columns, which look
# like index columns written out by earlier saves; confirm whether they are needed.
men_df = pd.read_csv(
    filepath_or_buffer='/Users/emmastoklundlee/Documents/Social Data Science - Masters/Thesis/data/pmi_pos_men_subsample_nostopwords.csv',
)

In [3]:
# Load the women's-football PMI/POS table (subsample, stop words removed).
# NOTE(review): absolute, machine-specific path; it only resolves on the author's machine.
women_df = pd.read_csv(
    filepath_or_buffer='/Users/emmastoklundlee/Documents/Social Data Science - Masters/Thesis/data/pmi_pos_women_subsample_nostopwords.csv',
)

In [8]:
# Preview the first 50 rows of the men's table to sanity-check the load.
men_df.head(n=50)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
0,0,NOUN,0.992365,club
1,1,PROPN,0.998454,united
2,2,PROPN,0.998882,liverpool
3,3,PROPN,0.962362,premier
4,4,NOUN,0.997468,team
5,5,NOUN,0.997275,game
6,6,NOUN,0.502868,ball
7,7,VERB,0.998715,said
8,8,PROPN,0.998678,newcastle
9,9,NOUN,0.997075,manager


In [5]:
# Women's table: first 20 rows tagged ADJ (adjectives), in the table's existing row order.
# presumably the rows are already ranked by PMI; verify against the script that wrote the CSV
women_df.loc[women_df['POS_Tag'].eq('ADJ')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
3,3,ADJ,0.845004,final
39,39,ADJ,0.995248,bright
53,53,ADJ,0.944451,female
86,86,ADJ,0.805176,harder
92,92,ADJ,0.969684,australian
95,95,ADJ,0.553204,super
112,112,ADJ,0.868006,white
185,185,ADJ,0.254069,half
186,186,ADJ,0.998191,sexual
188,188,ADJ,0.854388,solo


In [37]:
# Men's table: first 20 rows tagged ADJ (adjectives), in the table's existing row order.
men_df.loc[men_df['POS_Tag'].eq('ADJ')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
12,12,ADJ,0.780383,left
48,48,ADJ,0.995654,old
90,90,ADJ,0.995094,martial
91,91,ADJ,0.950681,grealish
96,96,ADJ,0.9944,real
100,100,ADJ,0.996871,doubtful
268,268,ADJ,0.970749,pulisic
331,331,ADJ,0.994715,unknown
398,398,ADJ,0.812084,ole
405,405,ADJ,0.986751,sané


In [38]:
# Women's table: first 20 rows tagged VERB, in the table's existing row order.
women_df.loc[women_df['POS_Tag'].eq('VERB')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
1,1,VERB,0.885811,play
21,21,VERB,0.472152,win
125,125,VERB,0.99858,think
197,197,VERB,0.998314,says
223,223,VERB,0.997189,going
229,229,VERB,0.831765,come
303,303,VERB,0.998238,ranked
359,359,VERB,0.992239,gets
413,413,VERB,0.987764,collects
425,425,VERB,0.533639,shot


In [39]:

# Men's table: first 20 rows tagged VERB, in the table's existing row order.
men_df.loc[men_df['POS_Tag'].eq('VERB')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
7,7,VERB,0.998715,said
358,358,VERB,0.810393,suspended
615,615,VERB,0.990243,writes
815,815,VERB,0.993278,linked
935,935,VERB,0.886394,burn
1002,1002,VERB,0.517751,moting
1009,1009,VERB,0.997207,asks
1014,1014,VERB,0.988694,relegated
1214,1214,VERB,0.911672,sign
1282,1282,VERB,0.994604,buy


In [40]:

# Women's table: first 20 rows tagged ADV (adverbs), in the table's existing row order.
women_df.loc[women_df['POS_Tag'].eq('ADV')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
19,19,ADV,0.998569,really
79,79,ADV,0.977201,forward
106,106,ADV,0.301531,uswnt
482,482,ADV,0.204145,rytting
764,764,ADV,0.998543,sexually
796,796,ADV,0.998348,abily
1112,1112,ADV,0.99835,globally
1224,1224,ADV,0.825842,standalone
1524,1524,ADV,0.564853,second
1903,1903,ADV,0.998407,internationally


In [41]:

# Men's table: first 20 rows tagged ADV (adverbs), in the table's existing row order.
men_df.loc[men_df['POS_Tag'].eq('ADV')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
1534,1534,ADV,0.998685,apparently
1730,1730,ADV,0.902745,afcon
1963,1963,ADV,0.978879,sideways
3021,3021,ADV,0.998723,unlawfully
3221,3221,ADV,0.998537,oddly
3282,3282,ADV,0.997816,wistfully
3328,3328,ADV,0.99842,agreeably
3534,3534,ADV,0.99866,hardly
3696,3696,ADV,0.998636,laughably
3786,3786,ADV,0.996664,miserably


In [42]:

# Women's table: first 20 rows tagged NOUN, in the table's existing row order.
women_df.loc[women_df['POS_Tag'].eq('NOUN')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
2,2,NOUN,0.978384,city
6,6,NOUN,0.986919,mins
7,7,NOUN,0.997211,tournament
13,13,NOUN,0.971996,bronze
23,23,NOUN,0.996339,girls
25,25,NOUN,0.939339,lionesses
38,38,NOUN,0.915611,soccer
80,80,NOUN,0.99828,group
81,81,NOUN,0.998715,teams
102,102,NOUN,0.992589,coach


In [43]:

# Men's table: first 20 rows tagged NOUN, in the table's existing row order.
men_df.loc[men_df['POS_Tag'].eq('NOUN')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
0,0,NOUN,0.992365,club
4,4,NOUN,0.997468,team
5,5,NOUN,0.997275,game
6,6,NOUN,0.502868,ball
9,9,NOUN,0.997075,manager
13,13,NOUN,0.991866,man
18,18,NOUN,0.912113,palace
35,35,NOUN,0.996282,wolves
47,47,NOUN,0.995319,transfer
50,50,NOUN,0.611386,conte


In [44]:

# Women's table: first 20 rows tagged PROPN (proper nouns), in the table's existing row order.
women_df.loc[women_df['POS_Tag'].eq('PROPN')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
0,0,PROPN,0.998636,arsenal
4,4,PROPN,0.997334,kerr
5,5,PROPN,0.998503,australia
8,8,PROPN,0.878364,matildas
9,9,PROPN,0.998874,hayes
10,10,PROPN,0.990587,wsl
11,11,PROPN,0.998705,lyon
12,12,PROPN,0.997819,mead
14,14,PROPN,0.812643,ve
15,15,PROPN,0.998298,usa


In [45]:
# Men's table: first 20 rows tagged PROPN (proper nouns), in the table's existing row order.
men_df.loc[men_df['POS_Tag'].eq('PROPN')].head(20)

Unnamed: 0.1,Unnamed: 0,POS_Tag,Confidence_Score,word
1,1,PROPN,0.998454,united
2,2,PROPN,0.998882,liverpool
3,3,PROPN,0.962362,premier
8,8,PROPN,0.998678,newcastle
10,10,PROPN,0.965943,west
11,11,PROPN,0.997732,mourinho
14,14,PROPN,0.998335,guardiola
15,15,PROPN,0.998143,klopp
16,16,PROPN,0.997875,kane
17,17,PROPN,0.998434,david
