## Dictionary analysis

This notebook briefly explores how well our word list taken from the Brown corpus performs.

In [65]:
import pandas as pd
import altair as alt

In [66]:
# Load the rankings table: one row per word, with one rank column per day.
df = pd.read_csv("rankings_all.csv")
# Preview the first rows to check the layout.
df.head()

Unnamed: 0,word,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10
0,aa,8917,4198,8708,12012,7543,6911,4447,3475,5821,6429
1,aaa,6971,7777,9069,14558,7847,12615,7249,8145,13844,10212
2,aah,39239,29971,48613,39130,32696,35617,62625,27820,28754,14752
3,aaron,8372,6214,5882,8859,5789,2693,3447,10172,7221,5811
4,ab,12924,6873,7797,8328,4757,7786,1116,7248,9967,7236


## Pandas refresher

In [67]:
# Row labels of the frame (one label per word row).
df.index

RangeIndex(start=0, stop=24384, step=1)

In [68]:
# Column labels: `word` followed by the per-day rank columns.
df.columns

Index(['word', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7',
       'day_8', 'day_9', 'day_10'],
      dtype='object')

In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric day column.
df.describe()

Unnamed: 0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10
count,24384.0,24384.0,24384.0,24384.0,24384.0,24384.0,24384.0,24384.0,24384.0,24384.0
mean,24304.484908,23615.73499,26235.387139,24507.053273,24424.123442,23997.031414,26042.450541,25280.291503,24112.234334,23495.701033
std,20874.705012,20802.510303,22608.149187,20672.786885,20889.740027,20947.73058,22061.739783,21179.273475,21077.943607,20282.413323
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,7385.75,6874.5,7301.75,7670.0,7492.75,7060.25,7697.5,7796.75,7036.75,7241.75
50%,17840.5,16943.5,19051.5,18201.5,18012.5,17426.5,19214.5,19152.5,17247.5,17396.5
75%,36677.5,35566.0,41370.5,37139.75,36745.5,36400.25,40574.0,38622.5,36687.0,34981.25
max,81664.0,81665.0,81662.0,81653.0,81657.0,81653.0,81667.0,81665.0,81663.0,81653.0


In [70]:
# Positional lookup: the row at position 3 (the fourth row), returned as a Series.
df.iloc[3]

word      aaron
day_1      8372
day_2      6214
day_3      5882
day_4      8859
day_5      5789
day_6      2693
day_7      3447
day_8     10172
day_9      7221
day_10     5811
Name: 3, dtype: object

Mean rank for each row (i.e. each word), averaged across all of the day columns

In [71]:
# axis=1 averages across the columns of each row; numeric_only=True skips the
# non-numeric `word` column so only the day ranks are averaged.
df.mean(axis=1, numeric_only=True)

0         6846.1
1         9828.7
2        35921.7
3         6446.0
4         7403.2
          ...   
24379     9613.3
24380    63912.8
24381    22435.4
24382    34947.4
24383    11028.7
Length: 24384, dtype: float64

Melt the data frame from wide format (one rank column per day) into long format (one row per word and day)

In [72]:
# Every column after `word` holds the ranks for one day.
value_vars = df.columns[1:]

# Reshape wide -> long: one row per (word, day) pair, keeping each word's
# original row label (ignore_index=False). The `day` labels ("day_7") are then
# reduced to their integer suffix (7).
df_molten = pd.melt(
    df,
    id_vars="word",
    value_vars=value_vars,
    var_name="day",
    value_name="rank",
    ignore_index=False,
).assign(
    day=lambda frame: frame["day"].map(lambda label: int(label.split("_")[1]))
)
df_molten.head()

Unnamed: 0,word,day,rank
0,aa,1,8917
1,aaa,1,6971
2,aah,1,39239
3,aaron,1,8372
4,ab,1,12924


In [73]:
number_days = 10

# Split the long-format frame into `number_days` frames, one per day,
# ordered from day 1 up to and including day `number_days`.
df_by_day = [
    df_molten.loc[df_molten["day"].eq(day)]
    for day in range(1, number_days + 1)
]

### Showing percentages 

I want to create a new DataFrame that shows, for each day and each threshold X, what percentage of that day's top-X rank positions are covered by words in our word list.

In [96]:
# First day's slice of the long-format data.
df_1 = df_by_day[0]
# NOTE(review): this is not the cell's last expression, so the preview is
# computed but never displayed.
df_1.head()

def calc_top_x_percentage(df: pd.DataFrame, x: int) -> float:
    """Return the percentage of the top ``x`` rank positions present in ``df``.

    Counts the rows whose ``rank`` value is strictly below ``x`` and expresses
    that count as a percentage of ``x``.

    NOTE(review): the strict ``<`` comparison treats ranks as 0-based (rank 0
    is the best position) — confirm this matches how the ranks were produced.

    Args:
        df: Frame with a numeric ``rank`` column.
        x: Size of the "top x" window; must be positive.

    Returns:
        ``100 * (number of rows with rank < x) / x`` as a float.

    Raises:
        ValueError: If ``x`` is not positive (the percentage is undefined).
    """
    if x <= 0:
        raise ValueError(f"x must be a positive integer, got {x!r}")
    # Summing the boolean mask counts the hits without building a filtered copy.
    hits = int((df["rank"] < x).sum())
    return 100 * hits / x

# Sizes X of the "top X" windows to evaluate.
thresholds = [1, 10, 25, 50, 100, 1000]


def thresholds_name(x):
    """Return the column label used for threshold ``x`` (e.g. ``top_10``)."""
    return f"top_{x}"


# Column-oriented table: a `day` column plus one percentage column per
# threshold, each holding one value per daily frame (in day order).
data = {"day": list(range(1, number_days + 1))}
for t in thresholds:
    data[thresholds_name(t)] = [
        calc_top_x_percentage(daily_df, t) for daily_df in df_by_day
    ]

df_thresholds = pd.DataFrame(data)
df_thresholds

Unnamed: 0,day,top_1,top_10,top_25,top_50,top_100,top_1000
0,1,100.0,100.0,100.0,84.0,87.0,92.0
1,2,100.0,90.0,96.0,92.0,96.0,96.7
2,3,100.0,100.0,100.0,100.0,98.0,94.8
3,4,100.0,100.0,92.0,92.0,87.0,82.6
4,5,100.0,60.0,64.0,78.0,84.0,90.8
5,6,100.0,100.0,96.0,98.0,99.0,96.5
6,7,0.0,40.0,68.0,64.0,74.0,87.0
7,8,100.0,90.0,80.0,80.0,79.0,89.6
8,9,100.0,100.0,96.0,88.0,88.0,93.0
9,10,100.0,100.0,100.0,100.0,100.0,93.6


In [100]:
# Now let's melt this data
value_vars = df_thresholds.columns[1:]

df_thresholds_molten = df_thresholds.melt(id_vars="day",
                                          var_name="threshold",
                                          value_vars=value_vars,
                                          value_name="percentage")
df_thresholds_molten["threshold"] = df_thresholds_molten["threshold"].apply(lambda t_str: int(t_str.split("_")[1]))

In [102]:
# Preview the long-format table: one row per (day, threshold) with its percentage.
df_thresholds_molten.head()

Unnamed: 0,day,threshold,percentage
0,1,1,100.0
1,2,1,100.0
2,3,1,100.0
3,4,1,100.0
4,5,1,100.0


In [139]:
# Bar chart of percentage per threshold, drawn as one panel (column facet)
# per day and coloured by day.
(
    alt.Chart(df_thresholds_molten)
    .mark_bar()
    .encode(
        x=alt.X("threshold:O"),
        y=alt.Y("percentage"),
        color=alt.Color("day:N", scale=alt.Scale(scheme="viridis")),
        column=alt.Column("day:N"),
    )
)

In [138]:
# Only plot days up to and including `day_limit`.
day_limit = 10
data = df_thresholds_molten[df_thresholds_molten["day"] <= day_limit]

# Bar chart of percentage per day, drawn as one panel (column facet) per
# threshold and coloured by day.
alt.Chart(data).mark_bar().encode(
    x="day:O",
    y="percentage",
    color=alt.Color("day").type("nominal").scale(scheme="viridis"),
    # Facet channels should be discrete. Without an explicit type the integer
    # `threshold` field is inferred as quantitative, so mark it ordinal, as the
    # per-day chart does for its threshold axis.
    column="threshold:O"
)