# Dotin Model Prep (Loading in Votes Data)

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import json

In [2]:
path = Path("./data") # CHANGE THIS TO YOUR WORKING DIRECTORY
os.listdir(path)

['data.zip',
 'DS19-01 description.docx',
 'mouse_paths.csv',
 'validations.csv',
 'mouse-flat.csv',
 'votes.csv']

In [3]:
mouse = pd.read_csv(path/"mouse-flat.csv") # note: we are loading the flat file
votes = pd.read_csv(path/"votes.csv")

In [4]:
mouse.shape

(8123046, 13)

In [5]:
mouse.head()

Unnamed: 0.1,Unnamed: 0,user_id,action,cord_x,cord_y,radio,time_since,id,created_at,updated_at,timeElapsed,window_x,window_y
0,0,365,s,0,0.0,,0.0,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860
1,1,365,m,601,167.0,,0.22,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860
2,2,365,m,602,166.0,,0.304,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860
3,3,365,c,602,166.0,,0.432,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860
4,4,365,m,602,166.0,,0.518,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860


In [6]:
votes.shape

(147980, 8)

In [7]:
votes.head()

Unnamed: 0,id,attr,value,score,user_id,created_at,updated_at,weight
0,1,psy,bf_1,1,365,2019-03-05 06:56:35.625896,2019-03-05 06:56:35.625896,0.0
1,2,psy,bf_2,1,365,2019-03-05 06:56:35.628088,2019-03-05 06:56:35.628088,0.0
2,3,psy,bf_3,1,365,2019-03-05 06:56:35.629157,2019-03-05 06:56:35.629157,0.0
3,4,psy,bf_4,1,365,2019-03-05 06:56:35.630054,2019-03-05 06:56:35.630054,0.0
4,5,psy,bf_5,1,365,2019-03-05 06:56:35.630927,2019-03-05 06:56:35.630927,0.0


In [12]:
votes_grouped = votes.groupby("user_id")
user_id, validation = [], []
for user, df in votes_grouped:
    # according to data description, users must pass atleast 4 of the six validaiton questions 
    passes = 0
    # check 1
    if int(df[df.value=="bf_validation_1"].score) == int(df[df.value=="bf_1"].score): passes += 1
    # check 2
    if int(df[df.value=="bf_validation_2"].score) == int(df[df.value=="bf_21"].score): passes += 1
    # check 3
    if int(df[df.value=="miq_validation_3"].score) in [2,4]: passes += 1
    # check 4
    if int(df[df.value=="miq_validation_4"].score) in [2,4]: passes += 1
    # check 5
    if int(df[df.value=="pgi_validation_5_Liking"].score) == 6 or int(df[df.value=="pgi_validation_5_Competence"].score) == 6: passes += 1
    # check 6
    if int(df[df.value=="pgi_validation_6_Liking"].score) == 4 or int(df[df.value=="pgi_validation_6_Competence"].score) == 4: passes += 1
    
    # appending to lists
    user_id.append(user)
    if passes>=4: 
        validation.append(True)
    else:
        validation.append(False)

validations = pd.DataFrame({"user_id":user_id,"validation":validation})

In [13]:
validations.head()

Unnamed: 0,user_id,validation
0,365,False
1,371,True
2,373,False
3,374,True
4,375,False


In [15]:
validations.validation.mean()

0.6980132450331126

In [16]:
validations.to_csv(path/"validations.csv",index=False)

In [17]:
# ignore this cell
for i in votes.value:
    if "validation_5" in i:
        print(i)

pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validation_5_Liking
pgi_validation_5_Competence
pgi_validati

## Merging Mouse and Validation

In [18]:
mouse_val = pd.merge(mouse, validations, on="user_id",how="left")

In [19]:
mouse_val.head()

Unnamed: 0.1,Unnamed: 0,user_id,action,cord_x,cord_y,radio,time_since,id,created_at,updated_at,timeElapsed,window_x,window_y,validation
0,0,365,s,0,0.0,,0.0,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860,False
1,1,365,m,601,167.0,,0.22,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860,False
2,2,365,m,602,166.0,,0.304,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860,False
3,3,365,c,602,166.0,,0.432,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860,False
4,4,365,m,602,166.0,,0.518,1,2019-03-05 06:56:35.848652,2019-03-05 06:56:35.848652,245.925,1600,860,False


In [20]:
mouse_val.validation.mean() # the mean went up slightly 

0.7311869217532438