In [1]:
import sys
# Put ../src on the module search path so the project's local modules can be imported.
sys.path.append("../src/")
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): the star import hides where names come from. `fn`, used by the
# cells below, is presumably provided by this import — confirm and consider
# importing it explicitly.
from PFEmodules import *
# Let DataFrame displays show up to 100 columns.
pd.options.display.max_columns = 100

In [2]:
input_path = "../data/data-cleaned.csv"
# The "Unnamed: 0" column is a useless leftover of the CSV export/import
# round trip. Drop it when present; errors="ignore" keeps this cell working
# if the file is ever exported without that column.
df = pd.read_csv(input_path, index_col=None).drop(
    columns=["Unnamed: 0"], errors="ignore"
)
# Display limits for DataFrames rendered in this notebook.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 50

### Dataframe preparation
* Encoding each course by category (see functions.py in ../src/PFE-modules)
* Encoding each academic year in a simple format (2A, 3A, etc.)
* Encoding each student's cohort (year of admission to the school)

In [3]:
def _move_new_column(frame, displaced_pos, target_pos):
    """Move the last (newly added) column of ``frame`` to ``target_pos``.

    The column found at ``displaced_pos`` is removed first and re-appended as
    the last column; the former last column is then inserted at ``target_pos``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose last column has just been added.
    displaced_pos : int
        Position of the column to send to the end.
    target_pos : int
        Position (after both removals) at which the new column is inserted.

    Returns
    -------
    pandas.DataFrame
        ``frame`` with its columns reordered.
    """
    cols = frame.columns.tolist()
    displaced = cols.pop(displaced_pos)
    newest = cols.pop(-1)
    cols.insert(target_pos, newest)
    cols.append(displaced)
    return frame[cols]


# Removing year 1997 because it contains only 1 student
df = df[df["EXERCICE"] != 1997].copy()

# Adding course code with appropriate function (see functions.py in modules)
df["CODE_COURS"] = df.apply(fn.assign_subject, axis=1)
df = _move_new_column(df, displaced_pos=1, target_pos=1)

# Adding year code
df["CODE_ANNEE"] = df.apply(fn.assign_year, axis=1)
df = _move_new_column(df, displaced_pos=2, target_pos=1)

# Adding cohort code
df["PROMO"] = df.apply(fn.get_promo, axis=1)
df = _move_new_column(df, displaced_pos=4, target_pos=3)

# Dropping year column. errors="ignore" tolerates an already-missing column
# (e.g. when the cell is re-run) without hiding unrelated errors the way a
# bare `except: pass` would.
df = df.drop(columns="LIBETUDE", errors="ignore")

### Creating smaller dfs

In [5]:
# Number of distinct course codes (CODE_COURS) taken by each student (CODELEV)
# over the whole of their academic track -> column "CTO"
df_tco = (
    df.groupby("CODELEV")["CODE_COURS"]
    .nunique()
    .reset_index(name="CTO")
)

# Same count, but per student and academic year (2nd, 3rd etc.) -> column "CTBAY"
df_tcbay = (
    df.groupby(["CODELEV", "CODE_ANNEE"])["CODE_COURS"]
    .nunique()
    .reset_index(name="CTBAY")
    .sort_values(by="CODELEV")
)

# Both counts side by side: one row per (student, academic year)
df_tcobay = pd.merge(df_tco, df_tcbay, on="CODELEV")

# Attach the overall count to every record, derive CAT_OMNI from it, then keep
# a single row per student (the first one encountered).
df_cat_with_year = pd.merge(df_tco, df, on="CODELEV")
df_cat_with_year["CAT_OMNI"] = df_cat_with_year.apply(fn.omnivorism_cat, axis=1)
df_cat_with_year = (
    df_cat_with_year
    .drop(columns=["CTO", "CODE_ANNEE"])
    .drop_duplicates(subset="CODELEV")
)

# Alias (not a copy) of the fully prepared frame, named for the export step
df_prepped = df

In [6]:
# Persist the prepared frame and every derived table as CSV.
# NOTE: to_csv writes the index by default, so each file gets a leading
# unnamed column that pandas reads back as "Unnamed: 0".
exports = (
    (df_prepped, "../data/data-prepped.csv"),
    (df_tco, "../data/df_tco.csv"),
    (df_tcbay, "../data/df_tcbay.csv"),
    (df_tcobay, "../data/df_tcobay.csv"),
    (df_cat_with_year, "../data/df_cat_with_year.csv"),
)
for frame, csv_path in exports:
    frame.to_csv(csv_path)