In [7]:
import os 
import pandas as pd 
import numpy as np
import collections

import warnings
warnings.filterwarnings("ignore")

In [8]:
def parse_distance(coord_data):
    """
    Calculate the incremental coordinate for each mouse movement
    :param coord_data:
    :return: A seperate numpy array for the x, y, user id list.
    """
    user_id = np.unique(coord_data[:, 0])[1:]
    total_x = []
    total_y = []
    user_id_list = []
    action_list = []
    page_list = []

    for user in user_id:
        index = coord_data[:, 0] == user
        new_index = coord_data[index]

        id_updated = new_index[1:, 0]
        action_updated =new_index[1:,3]
        page_updated =new_index[1:,4]

        distance_x = new_index[1:, 1] - new_index[:len(new_index) - 1, 1]
        distance_y = new_index[1:, 2] - new_index[:len(new_index) - 1, 2]

        total_x = np.append([total_x], [distance_x])
        total_y = np.append([total_y], [distance_y])
        
        user_id_list = np.append([user_id_list], [id_updated])
        action_list = np.append([action_list], [action_updated])
        page_list = np.append([page_list], [page_updated])

    return total_x, total_y, user_id_list, action_list, page_list


def parse_directions(id_list, x_coord, y_coord, action):
    """
    Converts coordinate changes all mouse movements into Cardinal direction. Ignores scrolls and clicks 

    :param id_list: Unique list of user ids
    :param x_coord: List of the changes in the x coordinate
    :param y_coord: List of the changes in the y coordinates
    :return: A data frame that contains the user id list, x directional changes, and y directional changes,
    calculated cardinal direction
    """

    directions = collections.deque()  # optimized for append operations
    
    for value in range(0, len(id_list)):

        x = x_coord[value]
        y = y_coord[value]
        
        action_value = action[value]
        
        if action_value == 'm':
            if x == 0 and y == 0:
                directions.append('No Movement')
            elif x > 0 and y == 0:
                directions.append('East')
            elif x < 0 and y == 0:
                directions.append('West')
            elif x == 0 and y > 0:
                directions.append('North')
            elif x == 0 and y < 0:
                directions.append('South')
            elif x > 0 and y > 0:
                directions.append('North East')
            elif x > 0 and y < 0:
                directions.append('North West')
            elif x < 0 and y < 0:
                directions.append('South West')
            elif x < 0 and y > 0:
                directions.append('South East')
            else:
                directions.append('TBD')  
        else: 
            directions.append(action_value)

    return pd.DataFrame({'User Id': id_list,
                         'Distance X': x_coord,
                         'Distance Y': y_coord,
                         'Direction': directions})

class page_description:
    """
    Class to analyze direction data. Purpose is to determine the movement count thresholdhold for hmm model.  
    """
    def __init__(self, data,page_number):
        self.page_number = page_number
        self.page = direction_data[direction_data['Page count'] == f'page {self.page_number}']
        self.data = data 

    def page_segment(self):
        """
        Desplays the user id belong to a that segment
        """
        page = self.data[self.data['Page count'] == f'page {self.page_number}']
        page = page.groupby('User Id').count()[['Page count']]
        return page

    def page_description(self):
        """
        Provide summary statistics on a page 
        """
        
        return self.page_segment().describe()

    def percentage(self):
        """
        prints the percentage of the total number of counts recorded on this page
        """
        
        value = f' {round((len(self.page) / len(self.data)) * 100,2)}% of all the mouse movemens are recorded in page {self.page_number}'
        
        return value

In [9]:
# set working directory (Change for youe)
os.chdir('C:/Users/ander/Google Drive/Columbia/Fall 2019/Capstone/Dotin-Columbia-Castone-Team-Alpha-')

# initilize path
mouse_flat_path = 'Data/Clean Data/mouse_flat_v4.csv'

# read file 
data = pd.read_csv(mouse_flat_path)

In [10]:
# remove duplicates 
user_id_seqence = []

for i in range(1, len(data)): 
    if data.iloc[i,2] == data.iloc[i-1,2]:
        continue 
    else: 
        user_id_seqence.append(data.iloc[i,2])
        
user_id_seq = pd.DataFrame({'user_id': user_id_seqence})
user_id_seq = user_id_seq['user_id'].value_counts()
multiple_survey = user_id_seq[user_id_seq > 1]
duplicate_ = multiple_survey.index
data = data[~data['user_id'].isin(duplicate_)]

In [11]:
# add page numbers 
direction = data[['user_id', 'direction']].values
user_list = set(data.user_id)
page_movement = collections.deque()
for user in user_list: 
    user_segment = direction[direction[:,0] == user]
    count = 1
    for user, action in user_segment:
        if action == 'PageChange ,':
            count += 1
            user_movement = f'page {count}'
            page_movement.append(user_movement)
        else: 
            user_movement = f'page {count}'
            page_movement.append(user_movement)   
            
data.direction = page_movement

In [12]:
# # filter users with less than 196 radio count, which is the number of questions in the data 
# subset_radio = data.loc[:,['user_id', 'radio']]
# subset_radio = subset_radio.dropna(subset = ['radio'])
# user_completion = subset_radio.groupby('user_id').count() >= 196
# user_completion = user_completion[user_completion['radio'] == True].reset_index()
# user_id_who_completed_survey = pd.DataFrame(user_completion['user_id'])
# completed_survey_data = data[data.user_id.isin(user_id_who_completed_survey.user_id)].reset_index()
# completed_survey_data = completed_survey_data.drop(columns = ['Index','index'])

In [13]:
coordinates =  data[['user_id','cord_x', 'cord_y','action', 'direction']].to_numpy()

In [14]:
# parse distance
coord_x, coord_y, coord_user, action_type, page_list = parse_distance(coordinates)

In [15]:
user_sytem = data[['user_id', 'system']].drop_duplicates()
user_sytem['system'].value_counts()

pc         673
Android      4
unknown      4
IpadOld      2
Windows      1
Name: system, dtype: int64

In [16]:
# parse direction
direction_data = parse_directions(coord_user, coord_x, coord_y, action_type)
direction_data['Page count'] = page_list
direction_data['Direction'].value_counts()

East           889368
North          842736
North East     817002
West           763493
South East     753051
South West     667311
North West     525782
South          485480
No Movement    215215
s              187356
c              169192
np                139
Name: Direction, dtype: int64

In [17]:
# remove all clicks, scrolls, and np 
direction_data = direction_data[-direction_data.Direction.isin(['c','s','np'])]
direction_data['Direction'].value_counts()

East           889368
North          842736
North East     817002
West           763493
South East     753051
South West     667311
North West     525782
South          485480
No Movement    215215
Name: Direction, dtype: int64

Page 1 

In [18]:
page_description(direction_data, 1).percentage()

' 36.14% of all the mouse movemens are recorded in page 1'

In [19]:
page_description(direction_data,1).page_description()

Unnamed: 0,Page count
count,550.0
mean,3915.749091
std,5149.885072
min,1.0
25%,352.0
50%,1087.5
75%,7118.0
max,25945.0


Page 2

In [20]:
page_description(direction_data,2).percentage()

' 7.04% of all the mouse movemens are recorded in page 2'

In [21]:
page_description(direction_data,2).page_description()

Unnamed: 0,Page count
count,369.0
mean,1136.425474
std,935.158177
min,2.0
25%,458.0
50%,938.0
75%,1537.0
max,5327.0


Page 3

In [22]:
page_description(direction_data,3).percentage()

' 6.58% of all the mouse movemens are recorded in page 3'

In [23]:
page_description(direction_data,3).page_description()

Unnamed: 0,Page count
count,354.0
mean,1107.042373
std,1061.908916
min,5.0
25%,444.25
50%,788.0
75%,1451.75
max,9199.0


Page 4 

In [24]:
page_description(direction_data,4).percentage()

' 5.45% of all the mouse movemens are recorded in page 4'

In [25]:
page_description(direction_data,4).page_description()

Unnamed: 0,Page count
count,343.0
mean,946.673469
std,787.10108
min,11.0
25%,387.0
50%,708.0
75%,1310.5
max,3613.0


Page 5 

In [26]:
page_description(direction_data,5).percentage()

' 5.14% of all the mouse movemens are recorded in page 5'

In [27]:
page_description(direction_data,5).page_description()

Unnamed: 0,Page count
count,328.0
mean,934.29878
std,773.012679
min,1.0
25%,402.25
50%,721.0
75%,1299.5
max,5255.0


Page 6

In [28]:
page_description(direction_data,6).percentage()

' 4.3% of all the mouse movemens are recorded in page 6'

In [29]:
page_description(direction_data,6).page_description()

Unnamed: 0,Page count
count,306.0
mean,837.986928
std,696.389796
min,3.0
25%,374.25
50%,667.5
75%,1090.5
max,5531.0


Page 7

In [30]:
page_description(direction_data,7).percentage()

' 4.09% of all the mouse movemens are recorded in page 7'

In [31]:
page_description(direction_data,7).page_description()

Unnamed: 0,Page count
count,294.0
mean,828.462585
std,693.333859
min,12.0
25%,389.75
50%,672.0
75%,1087.75
max,4827.0


Page 8 

In [32]:
page_description(direction_data,8).percentage()

' 3.69% of all the mouse movemens are recorded in page 8'

In [33]:
page_description(direction_data,8).page_description()

Unnamed: 0,Page count
count,283.0
mean,777.823322
std,653.083144
min,15.0
25%,332.5
50%,591.0
75%,1110.5
max,4340.0


Page 9 

In [34]:
page_description(direction_data,9).percentage()

' 3.41% of all the mouse movemens are recorded in page 9'

In [35]:
page_description(direction_data,9).page_description()

Unnamed: 0,Page count
count,267.0
mean,761.505618
std,600.190279
min,5.0
25%,377.0
50%,584.0
75%,1060.0
max,3104.0


Page 10 

In [36]:
page_description(direction_data,10).percentage()

' 3.18% of all the mouse movemens are recorded in page 10'

In [37]:
page_description(direction_data,10).page_description()

Unnamed: 0,Page count
count,255.0
mean,742.411765
std,1032.388283
min,5.0
25%,343.0
50%,526.0
75%,989.5
max,14741.0


Page 11

In [38]:
page_description(direction_data,11).percentage()

' 2.89% of all the mouse movemens are recorded in page 11'

In [39]:
page_description(direction_data,11).page_description()

Unnamed: 0,Page count
count,226.0
mean,761.575221
std,682.157118
min,2.0
25%,323.5
50%,548.5
75%,996.0
max,4134.0


Page 12

In [40]:
page_description(direction_data,12).percentage()

' 2.35% of all the mouse movemens are recorded in page 12'

In [41]:
page_description(direction_data,12).page_description()

Unnamed: 0,Page count
count,212.0
mean,659.849057
std,513.219238
min,9.0
25%,308.75
50%,564.0
75%,920.25
max,2663.0


Page 13

In [42]:
page_description(direction_data,13).percentage()

' 1.92% of all the mouse movemens are recorded in page 13'

In [43]:
page_description(direction_data,13).page_description()

Unnamed: 0,Page count
count,191.0
mean,598.08377
std,550.960839
min,2.0
25%,255.5
50%,506.0
75%,775.5
max,4872.0


Page 14

In [44]:
page_description(direction_data,14).percentage()

' 1.64% of all the mouse movemens are recorded in page 14'

In [45]:
page_description(direction_data,14).page_description()

Unnamed: 0,Page count
count,166.0
mean,587.060241
std,486.408516
min,14.0
25%,271.5
50%,494.0
75%,805.5
max,3239.0


Page 15

In [46]:
page_description(direction_data,15).percentage()

' 1.26% of all the mouse movemens are recorded in page 15'

In [47]:
page_description(direction_data,15).page_description()

Unnamed: 0,Page count
count,146.0
mean,516.212329
std,417.794673
min,4.0
25%,220.0
50%,411.0
75%,707.5
max,2050.0


In [48]:
page_number_list =[]
for value in range(1,16):
    page_ = f'page {value}'
    page_number_list.append(page_)

In [49]:
page_count = [1]
count = 1

for row in range(1,len(direction_data)): 
    if direction_data.iloc[row,4] == direction_data.iloc[row-1,4]: 
        count += 1
        page_count.append(count)
    else: 
        count = 1
        page_count.append(count)

In [50]:
# add observation
direction_data['observation'] = page_count

In [51]:
# remove all pages greater than 15 
direction_data = direction_data[direction_data['Page count'].isin(page_number_list)]

In [52]:
direction_data.to_csv(f'Models/Q1_Mouse Activity/Data/direction_data.csv')