# Data Cleanup
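This notebook cleans the student performance data: it encodes the categorical columns of the math and Portuguese course datasets as integers, combines the two datasets, and removes students that appear in both.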

In [85]:
import pandas as pd
import numpy as np

## Load Data
Load data and print number of points

In [86]:
# Both course files are semicolon-separated.
mathclass = pd.read_csv("student-mat.csv", sep=";")  # math course
portclass = pd.read_csv("student-por.csv", sep=";")  # Portuguese language course

# Report how many rows each file contributes, and the combined total.
math_rows = len(mathclass)
port_rows = len(portclass)
print('Math class data points: {}'.format(math_rows))
print('Portuguese class data points: {}'.format(port_rows))
print('Total data points: {}'.format(math_rows + port_rows))

Math class data points: 395
Portuguese class data points: 649
Total data points: 1044


## Remap Categorical Variables
Categorical variables need to be mapped to numbers

In [87]:
# Integer codes for every categorical column. The list order is the order in
# which the '<column>_num' columns are appended, so the column layout of the
# resulting frames does not depend on dict ordering.
YES_NO_CODES = {'yes': 0, 'no': 1}
JOB_CODES = {'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4}
CATEGORY_CODES = [
    ('school', {'GP': 0, 'MS': 1}),
    ('sex', {'F': 0, 'M': 1}),
    ('address', {'U': 0, 'R': 1}),
    ('famsize', {'LE3': 0, 'GT3': 1}),
    ('Pstatus', {'T': 0, 'A': 1}),
    ('Mjob', JOB_CODES),
    ('Fjob', JOB_CODES),
    ('reason', {'home': 0, 'reputation': 1, 'course': 2, 'other': 3}),
    ('guardian', {'mother': 0, 'father': 1, 'other': 2}),
    ('schoolsup', YES_NO_CODES),
    ('famsup', YES_NO_CODES),
    ('paid', YES_NO_CODES),
    ('activities', YES_NO_CODES),
    ('nursery', YES_NO_CODES),
    ('higher', YES_NO_CODES),
    ('internet', YES_NO_CODES),
    ('romantic', YES_NO_CODES),
]


def add_numeric_columns(frame, class_num):
    """Return a copy of ``frame`` with integer-coded categorical columns.

    For every ``(column, codes)`` pair in ``CATEGORY_CODES`` a new column named
    ``'<column>_num'`` is added holding ``frame[column]`` mapped through
    ``codes``. A constant ``'class_num'`` column is added last.

    Parameters
    ----------
    frame : pandas.DataFrame
        Course dataset containing every column named in ``CATEGORY_CODES``.
    class_num : int
        Course code stored in ``'class_num'``:
        math class = 0, Portuguese = 1, both = 2.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is left untouched.

    Raises
    ------
    ValueError
        If a column holds a non-missing value that has no code. ``Series.map``
        would otherwise turn such values into NaN without any warning.
    """
    encoded = frame.copy()
    for column, codes in CATEGORY_CODES:
        numeric = encoded[column].map(codes)
        # Values that were present in the source but are absent from the code
        # table; missing source values are allowed to stay missing.
        unmapped = encoded.loc[numeric.isna() & encoded[column].notna(), column]
        if not unmapped.empty:
            raise ValueError(
                'Column {!r} contains values without a numeric code: {}'.format(
                    column, sorted(unmapped.astype(str).unique())))
        encoded[column + '_num'] = numeric
    encoded['class_num'] = class_num
    return encoded


mathclass = add_numeric_columns(mathclass, class_num=0)
portclass = add_numeric_columns(portclass, class_num=1)


## Duplicate Deletion
As described in [student.txt](./student.txt), 382 students belong to both datasets. The final dataset is the combination of the Portuguese class and the math class, in which each student appears only once.

In [88]:
# Attributes used to recognise the same student in both course datasets.
KEY_COLUMNS = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu',
               'Fedu', 'Mjob', 'Fjob', 'reason', 'nursery', 'internet']

# Row pairs (one Portuguese row, one math row) that agree on every key column.
# Nothing guarantees the key is unique within a dataset, so a key carried by
# several rows contributes one pair per combination.
matched_pairs = pd.merge(portclass, mathclass, how='inner', on=KEY_COLUMNS)
print('Number of duplicates: {}'.format(len(matched_pairs)))

# Stack the two datasets, Portuguese rows first. ignore_index gives every row
# a unique label, so the boolean row selection below is unambiguous.
bothclass = pd.concat([portclass, mathclass], ignore_index=True)
print('Length of concatenated datasets from both classes: {}'.format(len(bothclass)))

# Key combinations that occur in BOTH source datasets. Only these identify a
# student taking both classes; a key that merely repeats inside one dataset
# must not be labelled as "both".
shared_keys = pd.merge(
    portclass[KEY_COLUMNS].drop_duplicates(),
    mathclass[KEY_COLUMNS].drop_duplicates(),
    how='inner', on=KEY_COLUMNS)

# A left merge keeps the stacked rows in order, and shared_keys has one row
# per key, so `membership` lines up row-for-row with `bothclass`.
membership = bothclass[KEY_COLUMNS].merge(
    shared_keys, how='left', on=KEY_COLUMNS, indicator=True)
assert len(membership) == len(bothclass)
in_both_classes = (membership['_merge'] == 'both').values

# Set shared students to class category 2, i.e. both classes
bothclass.loc[in_both_classes, 'class_num'] = 2

# Keep the first row per key (the Portuguese row for shared students).
# NOTE(review): rows that share every key column within a single dataset are
# collapsed as well -- confirm that such rows really are the same student.
bothclass = (
    bothclass
    .drop_duplicates(subset=KEY_COLUMNS, keep='first')
    .reset_index(drop=True)
)
print('Length of dataset after duplicates removed: {}'.format(len(bothclass)))

Number of duplicates: 382
Length of concatenated datasets from both classes: 1044
Length of dataset after duplicates removed: 662


## Check

In [89]:
# Preview the first rows only (first/last columns shown); printing the whole
# frame floods the notebook with hundreds of lines of output.
with pd.option_context('display.max_columns', 5):
    print(bothclass.head(20))

# Number of rows per class code (math class = 0, Portuguese = 1, both = 2),
# as a quick sanity check of the merge.
print(bothclass['class_num'].value_counts().sort_index())

    school sex    ...     romantic_num class_num
0       GP   F    ...                1         2
1       GP   F    ...                1         2
2       GP   F    ...                1         2
3       GP   F    ...                0         2
4       GP   F    ...                1         2
5       GP   M    ...                1         2
6       GP   M    ...                1         2
7       GP   F    ...                1         2
8       GP   M    ...                1         2
9       GP   M    ...                1         2
10      GP   F    ...                1         2
11      GP   F    ...                1         2
12      GP   M    ...                1         2
13      GP   M    ...                1         2
14      GP   M    ...                0         2
15      GP   F    ...                1         2
16      GP   F    ...                1         2
17      GP   F    ...                1         2
18      GP   M    ...                1         2
19      GP   M    ..

## Notes
Right now, some of the numbers don't add up. There are 395 math students, and 382 of them are reported to also be in the Portuguese class, so only 13 of them should be classified as '0' (math class only). However, more than 30 students end up classified as math-only, which is only possible if some of the duplicates appear in the data multiple times.

## Save Data
Save data to csv to load easily in future notebooks

In [90]:
# Write the cleaned frame (row index included as the first, unnamed column).
# na_rep is documented as a string, so missing values are written with the
# literal text 'nan' instead of passing the float np.nan and relying on
# implicit conversion.
bothclass.to_csv('CleanData.csv', sep=',', na_rep='nan')