In [1]:
import pandas as pd
df = pd.read_csv('aita_clean.csv')
df['text'] = df["title"] + df["body"].fillna("")
print(len(df))

63215

In [2]:
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer(max_features=10000,ngram_range=(1,1))
corpus_tfidf = tvec.fit_transform(df.text)
chi2score = chi2(corpus_tfidf, df.is_asshole)[0]

In [3]:
wscores = zip(tvec.get_feature_names(), chi2score)
wchi2 = sorted(wscores, key=lambda x:x[1])
topchi2 = wchi2[-10:]
print(topchi2)

[('girlfriend', 9.989495597528986), ('wanting', 10.028783637027201), ('lane', 12.774934236701373), ('family', 13.119185738168017), ('my', 15.107753043592673), ('dad', 16.50492328810366), ('edit', 16.569261497891993), ('mother', 18.206993216980287), ('wife', 20.276032064143966), ('mom', 28.579384986092162)]


In [4]:
# Cool, let's compare! Does saying mom/mother predispose you to say asshole or not?

df['contains_mom'] = df['text'].str.contains("mom|mother",case =False)
df['contains_dad'] = df['text'].str.contains("dad|father",case =False)
df['contains_gf'] = df['text'].str.contains("wife|girlfriend|gf",case =False)
df['contains_bf'] = df['text'].str.contains("husband|boyfriend|bf",case =False)

yta = df[df['is_asshole']==1]
nta = df[df['is_asshole']==0]



In [5]:
# Calculate log odds for each
import numpy as np

df['contains_mom'] = df['text'].str.contains("mom|mother",case =False)
df['contains_dad'] = df['text'].str.contains("dad|father",case =False)
df['contains_gf'] = df['text'].str.contains("wife|girlfriend|gf",case =False)
df['contains_bf'] = df['text'].str.contains("husband|boyfriend|bf",case =False)

yta = df[df['is_asshole']==1]
nta = df[df['is_asshole']==0]

odds_mom = np.log2( np.mean(yta['contains_mom']) / np.mean(nta['contains_mom']))
odds_dad = np.log2( np.mean(yta['contains_dad']) / np.mean(nta['contains_dad']))
odds_gf = np.log2( np.mean(yta['contains_gf']) / np.mean(nta['contains_gf']))
odds_bf = np.log2( np.mean(yta['contains_bf']) / np.mean(nta['contains_bf']))

who = ["Mom","Dad","Wife/Girlfriend","Husband/Boyfriend"]
odds = [odds_mom, odds_dad, odds_gf,odds_bf]

odds_df = pd.DataFrame(zip(who,odds), columns = ["Who","LogOdds"])
odds_df['direction'] = odds_df['LogOdds'] > 0


In [7]:
import altair as alt

# Specify the font - comment out if you do not have this font installed on your system
def brandon():
    font = "Brandon Grotesque"
    
    return {
        "config" : {
             "title": {'font': font},
             "axis": {
                  "labelFont": font,
                  "titleFont": font
             },
             "header": {
                  "labelFont": font,
                  "titleFont": font
             },
             "legend": {
                  "labelFont": font,
                  "titleFont": font
             }
        }
    }

alt.themes.register('brandon',brandon)
alt.themes.enable('brandon')

chart = alt.Chart(odds_df).mark_bar().encode(
    x=alt.X('LogOdds', axis=alt.Axis(title="Log odds YOU are the asshole")),
    y=alt.Y('Who',sort ="x",axis=alt.Axis(title="If you mention...")),
    color = alt.Color('direction',legend=None)
)
chart

In [19]:
# Actually, let's bootstrap so we can be at all confident these odds aren't really zero
#mom = []
#dad = []
#gf = []
#bf = []
#
#boots = 100
#
#for b in range(0,boots):
#    df_samp = df.sample(len(df), replace=True)
#    df_samp['contains_mom'] = df_samp['text'].str.contains("mom|mother",case =False)
#    df_samp['contains_dad'] = df_samp['text'].str.contains("dad|father",case =False)
#    df_samp['contains_gf'] = df_samp['text'].str.contains("wife|girlfriend|gf",case =False)
#    df_samp['contains_bf'] = df_samp['text'].str.contains("husband|boyfriend|bf",case =False)
#
#    yta = df_samp[df_samp['is_asshole']==1]
#    nta = df_samp[df_samp['is_asshole']==0]
#
#    odds_mom = np.log2( np.mean(yta['contains_mom']) / np.mean(nta['contains_mom']))
#    odds_dad = np.log2( np.mean(yta['contains_dad']) / np.mean(nta['contains_dad']))
#    odds_gf = np.log2( np.mean(yta['contains_gf']) / np.mean(nta['contains_gf']))
#    odds_bf = np.log2( np.mean(yta['contains_bf']) / np.mean(nta['contains_bf']))
#
#
#    mom.append(odds_mom)
#    dad.append(odds_dad)
#    gf.append(odds_gf)
#    bf.append(odds_bf)



KeyboardInterrupt: 

In [37]:
#who = ["Mom"]*len(mom) + ["Dad"] *len(dad) + ["GF"]*len(gf) + ["BF"]*len(bf)
#scores = mom + dad + gf + bf

#out = pd.DataFrame(zip(who,scores), columns = ["Who","LogOdds"])


