In [None]:
import os 
import pandas as pd 
import numpy as np
import collections

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Set the working directory to the project root.
# The location can be overridden with the DOTIN_PROJECT_DIR environment variable so the
# notebook runs on other machines; the default is the original author's local path, so
# existing runs behave exactly as before.
project_dir = os.environ.get(
    'DOTIN_PROJECT_DIR',
    'C:/Users/ander/Google Drive/Columbia/Fall 2019/Capstone/Dotin-Columbia-Castone-Team-Alpha-',
)
os.chdir(project_dir)

# Path of the flattened mouse-event file, relative to the project root.
mouse_flat_path = 'Data/Clean Data/mouse_flat_v4.csv'

# Read the file and preview the first rows.
data = pd.read_csv(mouse_flat_path)
data.head()

In [None]:
# Replace each row's `direction` value with the page the user was on for that event.
# Every user starts on page 1; a 'PageChange ,' row moves that user to the next page and is
# itself labelled with the new page.
#
# NOTE: this cell overwrites `data.direction`, so it is not idempotent -- after one run the
# 'PageChange ,' markers are gone and a second run would label every row 'page 1'.
direction = data[['user_id', 'direction']].values
user_list = set(data.user_id)

# Walk the rows once, in their original order, keeping one page counter per user.
# Building the labels in row order keeps them aligned with `data`; iterating over the
# unordered `user_list` set instead would emit labels grouped in set order, which only
# matches the rows by coincidence.
current_page = collections.defaultdict(lambda: 1)
page_movement = collections.deque()
for row_user, action in direction:
    if action == 'PageChange ,':
        current_page[row_user] += 1
    page_movement.append(f'page {current_page[row_user]}')

data['direction'] = list(page_movement)

In [None]:
# Keep only users with at least 196 non-null `radio` entries; 196 is the number of
# questions in the data, so fewer answers means the survey was not completed.
questions_in_survey = 196

subset_radio = data[['user_id', 'radio']].dropna(subset=['radio'])
answers_per_user = subset_radio.groupby('user_id').count()

# Boolean frame -> rows where the threshold is met -> `user_id` back as a column.
user_completion = answers_per_user >= questions_in_survey
user_completion = user_completion[user_completion['radio']].reset_index()
user_id_who_completed_survey = user_completion['user_id'].to_frame()

# Restrict the full event table to those users and drop the leftover index columns.
completed_mask = data['user_id'].isin(user_id_who_completed_survey['user_id'])
completed_survey_data = (
    data[completed_mask]
    .reset_index()
    .drop(columns=['Index', 'index'])
)

In [None]:
# Number of distinct users left after keeping only completed surveys.
len(set(completed_survey_data.user_id))

In [None]:
# Event table as a 2-D array with columns, in order: user_id, cord_x, cord_y, action, direction.
# NOTE(review): this is built from `data`, not from `completed_survey_data` -- confirm that
# including users who did not complete the survey is intended.
coordinates =  data[['user_id','cord_x', 'cord_y','action', 'direction']].to_numpy()

In [None]:
def parse_distance(coord_data):
    """
    Calculate the incremental coordinate change for each consecutive pair of rows of a user.

    For every user the first row only serves as the starting point, so a user with
    ``k`` rows contributes ``k - 1`` entries to each output.

    :param coord_data: 2-D numpy array whose columns are, in order:
        user id, x coordinate, y coordinate, action, page label.
    :return: A tuple of five 1-D numpy arrays of equal length:
        x deltas, y deltas, user ids, actions, page labels. The last three are taken
        from the later row of each pair. All arrays are empty when no user is processed.
    """

    def _join(parts):
        # np.concatenate rejects an empty sequence, so return an empty array instead.
        return np.concatenate(parts) if parts else np.array([])

    # NOTE(review): the [1:] drops the first (smallest) value returned by np.unique, so
    # that user is never processed. Kept as-is to preserve results -- confirm whether
    # skipping that id is intentional.
    user_id = np.unique(coord_data[:, 0])[1:]

    # Collect the per-user pieces and join them once at the end; appending to a numpy
    # array inside the loop would copy the whole accumulated array on every iteration.
    x_parts = []
    y_parts = []
    id_parts = []
    action_parts = []
    page_parts = []

    for user in user_id:
        user_rows = coord_data[coord_data[:, 0] == user]

        # Row i+1 minus row i for both coordinates.
        x_parts.append(user_rows[1:, 1] - user_rows[:-1, 1])
        y_parts.append(user_rows[1:, 2] - user_rows[:-1, 2])

        # Metadata of the later row of each pair, so every output stays aligned.
        id_parts.append(user_rows[1:, 0])
        action_parts.append(user_rows[1:, 3])
        page_parts.append(user_rows[1:, 4])

    return (_join(x_parts), _join(y_parts), _join(id_parts),
            _join(action_parts), _join(page_parts))


# Cardinal label for each (sign of x change, sign of y change) pair.
# Positive x is East and positive y is North, so (+x, -y) is South East and (-x, +y) is
# North West.
# NOTE(review): screen coordinates often grow downward -- confirm the y sign convention
# of the recorded data.
_DIRECTION_BY_SIGN = {
    (0, 0): 'No Movement',
    (1, 0): 'East',
    (-1, 0): 'West',
    (0, 1): 'North',
    (0, -1): 'South',
    (1, 1): 'North East',
    (1, -1): 'South East',
    (-1, -1): 'South West',
    (-1, 1): 'North West',
}


def _sign(delta):
    """Return 1, -1 or 0 for a positive, negative or zero value; None otherwise (e.g. NaN)."""
    if delta > 0:
        return 1
    if delta < 0:
        return -1
    if delta == 0:
        return 0
    return None


def parse_directions(id_list, x_coord, y_coord, action):
    """
    Convert the coordinate change of every mouse movement into a cardinal direction.

    Rows whose action is ``'m'`` are labelled from the signs of their x and y change.
    Rows with any other action keep that action value as their label, so they can be
    filtered out afterwards.

    :param id_list: User id of each row
    :param x_coord: Change in the x coordinate of each row
    :param y_coord: Change in the y coordinate of each row
    :param action: Action code of each row; only ``'m'`` rows get a cardinal direction
    :return: A data frame with the columns 'User Id', 'Distance X', 'Distance Y' and
        'Direction'. A movement whose change cannot be compared with zero (e.g. NaN)
        is labelled 'TBD'.
    """

    directions = []

    for _, x, y, action_value in zip(id_list, x_coord, y_coord, action):
        if action_value == 'm':
            directions.append(_DIRECTION_BY_SIGN.get((_sign(x), _sign(y)), 'TBD'))
        else:
            directions.append(action_value)

    return pd.DataFrame({'User Id': id_list,
                         'Distance X': x_coord,
                         'Distance Y': y_coord,
                         'Direction': directions})

In [None]:
# Parse distance: per-step x / y changes for each user, together with the user id,
# action and page label of the row each step ends on.
coord_x, coord_y, coord_user, action_type, page_list = parse_distance(coordinates)

In [None]:
# One row per distinct (user_id, system) pair, then count the pairs per system.
user_sytem = data.loc[:, ['user_id', 'system']].drop_duplicates()
user_sytem.loc[:, 'system'].value_counts()

In [None]:
# Label every step with a direction (or its action code) and attach the page it belongs to.
direction_data = parse_directions(
    id_list=coord_user,
    x_coord=coord_x,
    y_coord=coord_y,
    action=action_type,
).assign(**{'Page count': page_list})

# Frequency of each label.
direction_data.loc[:, 'Direction'].value_counts()

In [None]:
# Remove all clicks, scrolls, and np rows so that only movement directions remain.
# The mask is inverted with `~`, the boolean NOT operator for pandas masks, rather than
# with unary minus, which is an arithmetic negation.
non_movement_labels = ['c', 's', 'np']
direction_data = direction_data[~direction_data['Direction'].isin(non_movement_labels)]
direction_data['Direction'].value_counts()

In [None]:
class page_description:
    """
    Analyze the direction data of a single page. The purpose is to determine the
    movement-count threshold for the HMM model.

    :param page_number: Page to analyze; rows are selected where the 'Page count'
        column equals ``f'page {page_number}'``.
    :param data: Optional data frame with 'User Id' and 'Page count' columns. When
        omitted, the notebook-level ``direction_data`` frame is used, so existing
        ``page_description(n)`` calls keep working.
    """

    def __init__(self, page_number, data=None):
        self.page_number = page_number
        # Resolve the frame once so every method works on the same rows.
        self.data = direction_data if data is None else data
        self.page = self.data[self.data['Page count'] == f'page {self.page_number}']

    def page_segment(self):
        """
        Return the number of rows recorded on this page for each user.

        :return: Data frame indexed by 'User Id' with a single 'Page count' column.
        """
        return self.page.groupby('User Id').count()[['Page count']]

    def page_description(self):
        """
        Provide summary statistics of the per-user row counts on this page.
        """
        return self.page_segment().describe()

    def percentage(self):
        """
        Return a sentence stating which percentage of all rows was recorded on this page.

        An empty data frame yields 0.0% instead of raising ZeroDivisionError.
        """
        total_rows = len(self.data)
        share = (len(self.page) / total_rows) * 100 if total_rows else 0.0

        value = f' {round(share, 2)}% of all the mouse movemens are recorded in page {self.page_number}'

        return value

Page 1 

In [None]:
page_description(1).percentage()

In [None]:
page_description(1).page_description()

Page 2

In [None]:
page_description(2).percentage()

In [None]:
page_description(2).page_description()

Page 3

In [None]:
page_description(3).percentage()

In [None]:
page_description(3).page_description()

Page 4 

In [None]:
page_description(4).percentage()

In [None]:
page_description(4).page_description()

Page 5 

In [None]:
page_description(5).percentage()

In [None]:
page_description(5).page_description()

Page 6

In [None]:
page_description(6).percentage()

In [None]:
page_description(6).page_description()

Page 7

In [None]:
page_description(7).percentage()

In [None]:
page_description(7).page_description()

Page 8 

In [None]:
page_description(8).percentage()

In [None]:
page_description(8).page_description()

Page 9 

In [None]:
page_description(9).percentage()

In [None]:
page_description(9).page_description()

Page 10 

In [None]:
page_description(10).percentage()

In [None]:
page_description(10).page_description()

Page 11

In [None]:
page_description(11).percentage()

In [None]:
page_description(11).page_description()

Page 12

In [None]:
page_description(12).percentage()

In [None]:
page_description(12).page_description()

Page 13

In [None]:
page_description(13).percentage()

In [None]:
page_description(13).page_description()

Page 14

In [None]:
page_description(14).percentage()

In [None]:
page_description(14).page_description()

Page 15

In [None]:
page_description(15).percentage()

In [None]:
page_description(15).page_description()

In [None]:
# Build a running observation number (1, 2, 3, ...) that restarts for every user.
# NOTE(review): users are visited in the iteration order of the `user_list` set, while
# `direction_data` rows are grouped in the order produced by parse_distance. If
# `observation` is later attached to `direction_data` by position, verify the two
# orders agree.
observation =  collections.deque()
for user in user_list: 
    # Rows of the current user; only their count is used below.
    user_subset = direction_data[direction_data['User Id'] == user]
    obs_count = 1
    for row in range(0, len(user_subset)):
        observation.append(obs_count)
        obs_count += 1