# (3) Cleaning Student NMP ID

* **author** = Diego Sapunar-Opazo
* **copyright** = Copyright 2019, Thesis M.Sc. Diego Sapunar - Pontificia Universidad Católica de Chile
* **credits** = Diego Sapunar-Opazo, Ronald Perez, Mar Perez-Sanagustin, Jorge Maldonado-Mahauad
* **maintainer** = Diego Sapunar-Opazo
* **email** = dasapunar@uc.cl
* **status** = Dev

This script takes users_id_lms and users from the NMP database, together with students.csv, and creates a .csv file with two columns:

(1) **num_alumno**, which corresponds to the internal face-to-face students' id and

(2) **NMP_user_id**, which corresponds to NMP's internal id

## Part 0: Import Packages

In [4]:
# data analysis and wrangling
import pandas as pd
import numpy as np

## Part 1: Getting the Data

In [5]:
def read_data(path):
    '''
    Load a .csv file into a Pandas DataFrame.
    
    Input:
    path - String: location of the .csv file to load.
    
    Output:
    Pandas DataFrame: the contents of the .csv file.
    '''
    loaded = pd.read_csv(path)
    return loaded

## Part 2: Data Preprocessing

In [6]:
def preprocc_data(df, slices=False, columns_to_rename=False, categories=False):
    '''
    Build a cleaned copy of a dataframe: (1) keep only the requested columns; (2) rename columns; and (3) cast columns to category type.
    
    The input dataframe is never modified; a new dataframe is returned.
    
    Input: 
    df - Pandas DataFrame: dataframe to be cleaned.
    slices - List of Ints (or any other .iloc column indexer): positions of the columns to keep. If falsy, every column is kept.
    columns_to_rename - Dict: Columns to rename, Key: original name, Value: new name. Keys that are not columns of df are ignored. If falsy, nothing is renamed.
    categories - List of Strings: List of the names of the columns to be category type. If you renamed some columns, should be the new names. If falsy, no column is converted.
    
    Output:
    Pandas DataFrame: the cleaned copy of df.
    '''
    
    # select the columns first, so only the data that is actually kept gets copied
    df_cleaned = df.iloc[:, slices].copy() if slices else df.copy()
    
    # rename columns with the mapping received as argument
    # (not with the notebook-level _columns_to_rename global)
    if columns_to_rename:
        df_cleaned = df_cleaned.rename(columns=columns_to_rename)
    
    # creating categories
    if categories:
        df_cleaned = df_cleaned.astype({cat: 'category' for cat in categories})
    
    return df_cleaned

In [7]:
def merging(df1, df2, variable):
    '''
    Merge df1 and df2 over the variable (inner join).
    
    Rows containing any missing value are discarded from both dataframes before
    merging, and the variable column is compared as text on both sides.
    Neither df1 nor df2 is modified.
    
    Input:
    df1 - Pandas DataFrame
    df2 - Pandas DataFrame
    variable - String: name of the column to use as pivot. Must exist in both dataframes.
    
    Output:
    Pandas DataFrame: rows of df1 and df2 that share the same value of variable.
    '''

    # dropna() and astype() return new dataframes, so the caller's data is left untouched.
    # Missing values are dropped first; otherwise they would become the text 'nan'.
    left = df1.dropna().astype({variable: 'str'})
    right = df2.dropna().astype({variable: 'str'})
    
    return pd.merge(left=left, right=right, on=variable)
    

In [105]:
def get_students(df, df_students):
    '''
    Keep only the rows of df that belong to the students listed in df_students.
    
    The two dataframes are inner-joined on the pair (num_alumno, lms_id);
    num_alumno is compared as text on both sides. Neither input is modified.
    
    Input:
    df - Pandas DataFrame: df with the ids; must contain the columns num_alumno and lms_id.
    df_students - Pandas DataFrame: df with the students that are important for me! Must contain the columns num_alumno and lms_id.
    
    Output:
    Pandas DataFrame: the rows of df_students that have a match in df, followed by the remaining columns of df.
    '''
    # getting same types; astype() returns new dataframes, so the inputs are not altered
    ids = df.astype({'num_alumno': 'str'})
    students = df_students.astype({'num_alumno': 'str'})
    
    return students.merge(ids, on=['num_alumno', 'lms_id'])

## Part 3: Export Data

In [9]:
def export_data(df, path, columns_to_drop=False):
    '''
    Export df in .csv file to the path, without the index column.
    
    The dataframe received is not modified: the columns are dropped only
    from what is written to the file.
    
    Input:
    df - Pandas DataFrame: dataframe to be exported.
    path - String: path where the .csv will be exported.
    columns_to_drop - List of Strings: names of the columns to leave out of the export. If falsy, every column is exported.
    '''
    if columns_to_drop:
        # drop() without inplace returns a new dataframe, leaving the caller's one intact
        df = df.drop(columns_to_drop, axis=1)
        
    df.to_csv(path, index=False)

## Part 4: Main

In [109]:
# input files
_users_path = '../data/raw_data/NMP/users.csv'
_users_id_lms_path = '../data/raw_data/NMP/users_id_lms.csv'
_students_path = '../data/clean_data/students_lms_id.csv'

# output file
_export_path = '../data/clean_data/students_NMP_id.csv'

# original column name -> name used in this notebook
_columns_to_rename = {
    'user_id': 'NMP_user_id',
    'lms_user_id': 'lms_id',
    'user_id_lms': 'lms_id',
    'user_id_review': 'num_alumno'
}

# both raw files get the same treatment: keep the first two columns and rename them
df_users, df_lms_id = (
    preprocc_data(read_data(csv_path), slices=[0, 1], columns_to_rename=_columns_to_rename)
    for csv_path in (_users_path, _users_id_lms_path)
)

# join both tables through lms_id
df = merging(df1=df_users, df2=df_lms_id, variable='lms_id')

# keep only the students listed in the students file
df_aux = get_students(df=df, df_students=read_data(_students_path))

# lms_id was only needed for the joins, so it is left out of the exported file
export_data(df=df_aux, path=_export_path, columns_to_drop=['lms_id'])

In [111]:
# number of rows in df_aux, the dataframe returned by get_students
len(df_aux)

124