## Read in everything

In [3]:
import pandas as pd
from matplotlib import pyplot as plt
from math import ceil
import numpy as np
pd.options.display.max_columns = 1000
from google.colab import files
import urllib.request
from zipfile import ZipFile
from io import BytesIO

ModuleNotFoundError: No module named 'google'

Open proc_zip from url as a dataframe

In [4]:
proc_zip_url = 'https://github.com/fielddaylab/opengamedata/blob/master/jupyter/lakeland_data/LAKELAND_20191201_to_20191231_b2cf46d_proc.zip?raw=true'
resp = urllib.request.urlopen(proc_zip_url)
zipfile = ZipFile(BytesIO(resp.read()))
with zipfile.open(zipfile.namelist()[0]) as f:
    df = pd.read_csv(f)

NameError: name 'urllib' is not defined

In [None]:
print(df.shape)
df.head()

## Filtering
- Filtered out the sessions that used SPYPARTY (debug=1)
- Filtered any sessions that were not between 60 seconds and 3600 seconds
- Filtered out continues
- Filtered out any sessions that did not have at least 3 active events in lvl0 and 10 in the session

In [None]:
df = df[df['debug'] < 1]
df = df[df['continue'] > 0]
df = df[(60 < df['sessDuration']) & (df['sessDuration'] < 60*60*1)]
df = df[df['lvl0_ActiveEventCount'] > 2]
df = df[df['sess_ActiveEventCount'] > 9]
df

## Filter out which columns I want to use.
#### Print all level features available

In [None]:
hover_f_avg = lambda i, item: f'lvl{i}_avg_num_tiles_hovered_before_placing_{item}'
hover_f_tot = lambda i, item: f'lvl{i}_tot_num_tiles_hovered_before_placing_{item}'
for i in range(10):
  for item in ['home','food','farm','fertilizer','livestock','skimmer','sign','road']:
    df[hover_f_tot(i,item)] = df[hover_f_avg(i,item)].fillna(0) * df[f'lvl{i}_count_buy_{item}'].fillna(0)
  df[hover_f_tot(i,"buys")] = df.loc[:,hover_f_tot(i,"home"):hover_f_tot(i,"road")].sum(axis=1)
  df[f'lvl{i}_count_buys'] = df.loc[:,f'lvl{i}_count_buy_home':f'lvl{i}_count_buy_road'].fillna(0).sum(axis=1)

In [None]:
all_feats = [(i,c) for i,c in enumerate(df.columns)]
lvl0_feats = [f for f in all_feats if f[1].startswith("lvl0")]
lvl0_feats

#### Decided to use the following



In [None]:
feature_names = [
  'tot_num_tiles_hovered_before_placing_buys',
  'count_buy_home',
  'count_buy_farm',
  'count_buy_livestock',
  'max_num_food_marked_use',
  'count_buys'
]
levels = range(6)
df2 = df.loc[:,[f'lvl{i}_{fn}' for fn in feature_names for i in levels]].fillna(0)
for fn in feature_names:
  df2[f'sum_{fn}_to_lvl_{levels[-1]}'] = df2[[f'lvl{i}_{fn}' for i in levels]].sum(axis=1)
df2[f'avg_num_hovers_per_buy_to_lvl_{levels[-1]}'] = df2[f'sum_tot_num_tiles_hovered_before_placing_buys_to_lvl_{levels[-1]}'] / df2[f'sum_count_buys_to_lvl_{levels[-1]}']

In [None]:
df2

In [None]:
df3 = df2.loc[:, 'sum_tot_num_tiles_hovered_before_placing_buys_to_lvl_5':'avg_num_hovers_per_buy_to_lvl_5'].copy()
df3 = df3.drop('sum_count_buys_to_lvl_5',axis=1)
df3 = df3.drop('sum_max_num_food_marked_use_to_lvl_5',axis=1)
df3 = df3.rename(lambda s: s[4:-9],axis=1)
df = df.drop(df3[df3['num_hovers_per_buy'] < 1].index)
df3 = df3.drop(df3[df3['num_hovers_per_buy'] < 1].index)
df3 = df3.fillna(0)
df3

In [None]:
df3.describe()

In [None]:
df3.hist(figsize=(20,20),bins=50)

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans,DBSCAN
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

scale_normalize = make_pipeline(RobustScaler(), Normalizer()) #, PCA(n_components=3), KMeans(5))
X = scale_normalize.fit_transform(df3.to_numpy())
pd.DataFrame(X, columns = df3.columns).hist(figsize=(20,20),bins=50)

In [None]:
import numpy as np
U,S,V = np.linalg.svd(X)
eigvals = S**2 / np.sum(S**2)
fig = plt.figure(figsize=(8,5))
sing_vals = np.arange(X.shape[1]) + 1
plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')

Begin Using PCA projections

In [None]:
from mpl_toolkits.mplot3d import Axes3D
plt.figure(figsize=(20,20))
ax = plt.axes(projection='3d')
pca = PCA(n_components=3)
projected = pca.fit_transform(X)
kmeans = KMeans(5).fit(projected)
intent_labels=kmeans.labels_
ax.scatter3D(projected[:,0], projected[:,1], projected[:,2], c=kmeans.labels_);

In [None]:
def calc_kmeans_errors(kmeans, X):
  errors = [0]*(kmeans.n_clusters)
  for r,l in zip(X, kmeans.labels_):
    errors[l] += np.linalg.norm(r-kmeans.cluster_centers_[l])
  return(sum(errors),sum(errors)/kmeans.n_clusters)
for nd in range(1,6):
  projected = PCA(nd).fit_transform(X)
  for k in range(2,11):
    print(f'nd={nd}, k={k}')
    kmeans = KMeans(k).fit(projected)
    print(f'error = {[x//1 for x in calc_kmeans_errors(kmeans,projected)]}')

Begin using Factor Analysis

In [None]:
from sklearn.decomposition import FactorAnalysis
from mpl_toolkits.mplot3d import Axes3D
plt.figure(figsize=(20,20))
ax = plt.axes(projection='3d')
transformer = FactorAnalysis(n_components=3, random_state=0)
projected = transformer.fit_transform(X)
kmeans = KMeans(5).fit(projected)
intent_labels=kmeans.labels_
ax.scatter3D(projected[:,0], projected[:,1], projected[:,2], c=kmeans.labels_);

In [None]:
def calc_kmeans_errors(kmeans, X):
  errors = [0]*(kmeans.n_clusters)
  for r,l in zip(X, kmeans.labels_):
    errors[l] += np.linalg.norm(r-kmeans.cluster_centers_[l])
  return(sum(errors),sum(errors)/kmeans.n_clusters)
for nd in range(1,6):
  projected = FactorAnalysis(nd, random_state=0).fit_transform(X)
  for k in range(2,6):
    print(f'nd={nd}, k={k}')
    kmeans = KMeans(k).fit(projected)
    print(f'error = {[x//1 for x in calc_kmeans_errors(kmeans,projected)]}')

In [None]:
plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')

## Nonintentional - milk produced, corn produced, deaths, achievements, blooms, money earned

In [None]:
feats = [
         'count_food_produced',
         'count_milk_produced',
         'count_deaths',
         'count_blooms',
         'count_achievements',
         'money_earned',
]
levels = range(6)
new_df_dict = {}
for f in feats:
  new_df_dict[f] = df[[f'lvl{i}_{f}' for i in levels]].fillna(0).sum(axis=1)
unintent_df = pd.DataFrame(new_df_dict)

In [None]:
X = scale_normalize.fit_transform(unintent_df.to_numpy())
pd.DataFrame(X, columns = unintent_df.columns).hist(figsize=(20,20),bins=50)

In [None]:
U,S,V = np.linalg.svd(X)
eigvals = S**2 / np.sum(S**2)
fig = plt.figure(figsize=(8,5))
sing_vals = np.arange(X.shape[1]) + 1
plt.plot(sing_vals, eigvals, 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')

In [None]:
error_per_cluster = []
for nd in [4]:
  projected = PCA(nd).fit_transform(X)
  for k in range(2,11):
    print(f'nd={nd}, k={k}')
    kmeans = KMeans(k).fit(projected)
    print(f'error = {[x//1 for x in calc_kmeans_errors(kmeans,projected)]}')
    error_per_cluster.append(calc_kmeans_errors(kmeans,projected)[0])
plt.plot(range(2,11), error_per_cluster, 'ro-', linewidth=2)
plt.title('Finding K')
plt.xlabel('k')
plt.ylabel('Avg Summed Distances to Cluster Center')

In [None]:
plt.figure(figsize=(20,20))
ax = plt.axes(projection='3d')
pca = PCA(n_components=4)
projected = pca.fit_transform(X)
kmeans = KMeans(6).fit(projected)
unintent_labels = kmeans.labels_

ax.scatter3D(projected[:,0], projected[:,1], projected[:,2], c=kmeans.labels_,cmap='Set1');

In [None]:
cluter_to_cluster_df = pd.DataFrame({'intent': intent_labels, 'unintent': unintent_labels})
cluter_to_cluster_df

In [None]:
cluter_to_cluster_df.plot.scatter('intent','unintent')

In [None]:
error_per_cluster = []
k_vals = range(2,30)
for k in k_vals:
  tX = cluter_to_cluster_df.to_numpy()
  kmeans = KMeans(k).fit(tX)
  error_per_cluster.append(calc_kmeans_errors(kmeans,tX)[0])
plt.plot(k_vals, error_per_cluster, 'ro-', linewidth=2)
plt.title('Finding K')
plt.xlabel('k')
plt.ylabel('Avg Summed Distances to Cluster Center')

In [None]:
final_kmeans = KMeans(12).fit(cluter_to_cluster_df.to_numpy())
final_labels = final_kmeans.labels_
cluter_to_cluster_df.plot.scatter('intent','unintent',c=final_labels,cmap='Paired')

In [None]:

from collections import Counter
Counter(final_labels)

In [None]:
len(final_labels)

In [None]:
df3.plot.scatter(1,2,c=final_labels,cmap='tab20')

In [None]:
important_df = pd.concat([df3, unintent_df],axis=1,sort=False)
columns = important_df.columns
normalized_important_df = pd.DataFrame(scale_normalize.fit_transform(important_df.to_numpy()), columns = columns)
for x in range(len(columns)):
  for y in range(x+1,len(columns)):
    title = f'{columns[y]} vs {columns[x]}'
    normalized_important_df.plot.scatter(x,y,c=final_labels, cmap='Paired',figsize=(10,10),title=f'{columns[y]} vs {columns[x]}')
    plt.savefig(f'01312019_{title}.png')
    #files.download(f'01312019_{title}.png')

In [None]:
normalized_important_df