# Data Analysis

## Setup
### Import libraries

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import wordlemodule
from scipy.stats import pearsonr
import pandas as pd
import seaborn as sns

### Open data from saved file and create a WordleModule object with it

In [None]:
# Load the previously saved email data from disk.
# NOTE: unpickling can execute arbitrary code, so this path must only ever
# point at a file produced by this project.
with open('Data/Script data/Imported_email_data_with_ints.pkl', 'rb') as pkl_handle:
    full_data = pickle.load(pkl_handle)

# Wrap the loaded data in the project's WordleData helper; the plotting cells
# in this notebook read everything through this `data` object.
data = wordlemodule.WordleData(full_data)

## Plots

### Histogram of average solve score

In [None]:
# Histogram of solve scores, drawn on figure 1.
# NOTE(review): assumes data.data_arr holds one column per key of
# data.data_dict (same pairing the pairplot cell uses) - confirm in wordlemodule.
hist_solve_score = plt.figure(1)
ax = hist_solve_score.subplots()

# Tick labels for the seven unit-wide bins; the last bin is shown as "Unsolved".
score_labels = ['1', '2', '3', '4', '5', '6', 'Unsolved']

# Bin edges at 0.5, 1.5, ..., 7.5 centre each bin on an integer score.
# Passing the names via `label=` ties each legend entry to its own bar series.
ax.hist(data.data_arr,
        bins=np.arange(0.5, 8.5, 1),
        label=list(data.data_dict.keys()))

# Pin the tick positions before naming them. Labelling whatever ticks the
# automatic locator happens to choose can silently shift the labels off the bars.
ax.set_xticks(np.arange(1, len(score_labels) + 1))
ax.set_xticklabels(score_labels)

ax.set_ylabel('Number of occurrences')
ax.set_xlabel('Number of guesses')
ax.set_title('Histogram of Wordle solve score')
ax.legend()

### Solve score over time

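To calculate the trend over time, each person's solve score is smoothed with a rolling average (window of 20) before plotting.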

In [None]:
# Rolling-average table built by the WordleData helper (window argument of 20).
df = data.rolling(20)

# Create figure 2 together with its axes in a single call. Calling
# plt.figure(2) and then plt.subplots() separately leaves figure 2 empty and
# draws on a second, unrelated figure.
score_over_time, ax = plt.subplots(num=2)
fig = score_over_time  # both names refer to the one figure that is drawn on

# Line plot of the rolling-average table on the prepared axes.
# sharex/sharey are omitted: they only take effect together with subplots=True.
plot = df.plot(title="Score over time (rolling average)",
               ax=ax,
               ylim=[0, 6],
               yticks=range(1, 7, 1))

# Figure-level axis labels
fig.supylabel('Solve score')
fig.supxlabel('Puzzle number')

### Solve score over time (heatmap)


In [None]:
# Rolling-average table (window argument of 20), rebuilt here so the cell
# does not depend on `df` from an earlier cell.
df = data.rolling(20)

# Heatmap of the transposed table: the table's columns become heatmap rows.
# The colour scale is fixed to the range 0..6.
ax = sns.heatmap(data=df.T, cmap='turbo', vmin=0, vmax=6)
ax.set_title('Solve score over time')
ax.set_xlabel('Puzzle number')

### By week-day
Does solve score depend on the day of the week? For example, people may have more time to think about guesses on weekends, and therefore achieve better scores (fewer guesses) on weekends than on weekdays.

However, 

In [None]:
# Two summary tables from the WordleData helper: puzzles attempted and solve score.
# NOTE(review): presumably both are indexed by weekday name with one column
# per person - confirm against wordlemodule.WordleData.weekly().
df_weekly_sum, df_weekly_score = data.weekly()

# 2 x 3 grid of axes: row 0 = puzzles attempted, row 1 = solve score
f, myax = plt.subplots(2, 3)
f.set_size_inches(18, 10)

# Column 0: one group of bars per index entry of each table
first_column_specs = (
    (df_weekly_sum, 'Mean number of puzzles attempted'),
    (df_weekly_score, 'Mean score of puzzles attempted'),
)
for row, (frame, y_label) in enumerate(first_column_specs):
    frame.plot.bar(ylabel=y_label, xlabel='', rot=45, ax=myax[row, 0])

# Tag every row as Weekday, overwrite the two weekend rows, then reshape to
# long form (one row per day/person pair) for seaborn.
weekends = ['Sunday','Saturday']
long_frames = []
for frame in (df_weekly_sum, df_weekly_score):
    frame['Day type'] = 'Weekday'
    frame.loc[weekends, 'Day type'] = 'Weekend'
    long_frames.append(
        frame.melt(id_vars='Day type', var_name='Person', value_name='Value')
    )
df_weekly_sum_long, df_weekly_score_long = long_frames

# Column 1: per person, split by weekday/weekend.
# Column 2: weekday vs weekend, pooled over people.
column_specs = (
    (1, {'x': 'Person', 'hue': 'Day type'}),
    (2, {'x': 'Day type'}),
)
for col, extra_kwargs in column_specs:
    for row, long_frame in enumerate(long_frames):
        sns.barplot(long_frame,
                    y='Value',
                    legend=True,
                    ax=myax[row, col],
                    **extra_kwargs)
        # Axis labels are blanked on these panels
        myax[row, col].set(ylabel='', xlabel='')

### Longest streak (win and no play)

### Scatterplot
- Compare each person's score on every game: are the same words hard for everyone?
- Fit a regression for each pair of people to see whose scores track each other best.

In [None]:
# Score table with one column per key of data.data_dict
df = pd.DataFrame(data=data.data_arr, columns=data.data_dict.keys())

# Grid comparing every pair of columns, with a regression fit drawn in each panel
g = sns.pairplot(data=df, kind='reg')

# TODO: annotate the lower-triangle panels with Pearson's r-squared and the
# matching p-value (planned via data.corr_pvals() and g.map_lower(...)).
# The earlier draft formatted the whole df.corr() table with ':.2f', which
# cannot work: the annotation needs the single value for the (x, y) pair
# being drawn.



In [None]:
# Rebuild the score table (one column per key of data.data_dict) so later
# cells start from the raw scores rather than a reshaped `df`.
player_names = data.data_dict.keys()
df = pd.DataFrame(data=data.data_arr, columns=player_names)

### Which letter slots are easiest to guess? Slot 1?

### What are the most common letter patterns and their guess rates?