### 1. Laden Sie die Trainingsdaten.

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None

train_df = pd.read_csv('datasets/train.csv')

train_df.head() # See description above for column description

Unnamed: 0,transactionId,basket,customerType,totalAmount,returnLabel
0,7934161612,[3],existing,77.0,0
1,5308629088,"[5, 3, 0, 3]",existing,64.0,0
2,1951363325,"[3, 3, 1, 4]",new,308.0,1
3,6713597713,[2],existing,74.0,0
4,8352683669,"[4, 4, 4, 4]",new,324.0,1


### 2. Füllen Sie die fehlenden Werte in den Trainingsdaten auf
1, Analysis of missing data  
2. Fill/Remove missing data

In [2]:
# Analysis
heading = "Total number of missing values:"
getSeparator = lambda text: len(text)*"-"

print(getSeparator(heading)) # Just print a nice separator line xD
print(heading)
print(train_df.isnull().sum())
print(getSeparator(heading))

# Calculate the percentage of the cases where the customer type is null
# Use this value to decide, whether this values can be "safely" removed
customerType_isNull_perc = str(round(train_df["customerType"].isnull().sum() * 100 / train_df["customerType"].count(), 2)).replace('.', ',')

print(f'\nThe percentage of missing values in CustomerType is only {customerType_isNull_perc} % ==> Removing missing values should not have a big impact on the resulting model!')

-------------------------------
Total number of missing values:
transactionId      0
basket             0
customerType     517
totalAmount      484
returnLabel        0
dtype: int64
-------------------------------

The percentage of missing values in CustomerType is only 2,11 % ==> Removing missing values should not have a big impact on the resulting model!


In [3]:
# Clean up
# Customer type
train_clean = train_df[train_df['customerType'].notna()]

# Total amount
totalAmount_median = train_clean['totalAmount'].mean()
train_clean['totalAmount'].fillna(totalAmount_median, inplace=True)

print(f'Total number of missing values after cleaning up: {train_clean.isnull().sum().sum()}')

Total number of missing values after cleaning up: 0


### 3. Transformieren Sie die kategorischen Features mittles One-hot-encoding

1. Actually get categorical features
2. One-Hot Encode categorical features from steps 1

In [4]:
# Just a tiny helper for getting the width and print the statement centered  ¯\_(ツ)_/¯ 
# TODO migrate to external helper file
import os 
centeredPrint = lambda statement: print(statement.center(os.get_terminal_size().columns))

#### A) Get a list/Find out categorical columns

> As seen in: https://stackoverflow.com/questions/29803093/check-which-columns-in-dataframe-are-categorical

In [5]:
columns = train_clean.columns
# Columns with numerical data
num_cols = train_clean._get_numeric_data().columns
# Now Substract all columns from the numerical ones
categorical_columns = list(set(columns) - set(num_cols))

print("Categorical attributes found: ", *categorical_columns, sep="\n* ")

Categorical attributes found: 
* customerType
* basket


#### B) Hot encode customer type

> Use pandas build in function

In [6]:
one_hot_customerType = pd.get_dummies(train_clean['customerType'])

In [7]:
centeredPrint('One hot encoding for the customer type:\n\n')
one_hot_customerType.head()

                                                                One hot encoding for the customer type:

                                                                 


Unnamed: 0,existing,new
0,1,0
1,1,0
2,0,1
3,1,0
4,0,1


#### B) Hot encode the basket values

> We'll do this manually

<hr>
Get the max/min values in the basket feature lists
<hr>

In [8]:
# TODO migrate as helper to external file

import re # RegEx

'''
Convert's a string list into a regular list:
F.ex. [0, 1, 2, 3] from type **String** -> to type **Int**

Let's deconstruct this:

1. In the for loop: Convert the 'String' list into a regular list by:
    * Remove the Square brackets
    * Split the string into a list
2. For each element in the created list convert it from string to int

Könnnen es gerne vereinfachen, wenn es wärend der Präzi zu verwirrungen kommen könnte 😅

'''
getIntListFromStringList = lambda stringList: [int(listElement) for listElement in re.sub("[\[\]]", "", stringList).split(',')]
 
min_basket_value = min([min(getIntListFromStringList(list)) for list in train_clean['basket']]) 
max_basket_value = max([max(getIntListFromStringList(list)) for list in train_clean['basket']]) 

print(f'The minimal value basket element is {min_basket_value} and the max basket value is {max_basket_value}')

The minimal value basket element is 0 and the max basket value is 5


<hr>
Create new features based on the elements in the basket:
<hr>

In [9]:
# 1. Create new basket labels

# List with basket elements
# NOTE: We assume, that we have all these values in the basket label array
#       We did NOT test this! Maybe test this to have more security...
# TODO check whether all values are actually in the list
basketElements = list(range(min_basket_value, max_basket_value+1))

# Data frame with columns: 'b_0 | b_1 | ...' for each of our basket elements
one_hot_basket = pd.DataFrame([], columns=[f'b_{basketElement}' for basketElement in basketElements])

'''
Do the one hot encoding for the basket feature.
Actually just:
    1. Check for the current basket, whether the element is present
    2. If present, set the encoding bit
'''
for basketElement in basketElements:
      one_hot_basket[f'b_{basketElement}'] = train_clean['basket'].apply(lambda x: x.count(str(basketElement)))

one_hot_basket.head()

Unnamed: 0,b_0,b_1,b_2,b_3,b_4,b_5
0,0,0,0,1,0,0
1,1,0,0,2,0,1
2,0,1,0,2,1,0
3,0,0,1,0,0,0
4,0,0,0,0,4,0


#### C) Concatenate it back into the original dataframe

> Use pandas build in function

1. Concatenate
2. Clean up no more needed features

In [10]:
train_all = pd.concat([train_clean, one_hot_customerType, one_hot_basket], axis=1)

# Clean up: # Drop Basket and customerType as they are no longer needed
train_all = train_all.drop(columns=['basket', 'customerType'])

train_all.head()

Unnamed: 0,transactionId,totalAmount,returnLabel,existing,new,b_0,b_1,b_2,b_3,b_4,b_5
0,7934161612,77.0,0,1,0,0,0,0,1,0,0
1,5308629088,64.0,0,1,0,1,0,0,2,0,1
2,1951363325,308.0,1,0,1,0,1,0,2,1,0
3,6713597713,74.0,0,1,0,0,0,1,0,0,0
4,8352683669,324.0,1,0,1,0,0,0,0,4,0


### 4. Versuchen Sie auf Basis des Attributs basket Features zu bauen (z.B. wie oft kommt jede Kategorie im Basket vor).

In [11]:
# TODO Was war der unterschied zu dem was oben gemacht wurde?

### 5. Skalieren Sie die Features mit einem StandardScaler.

> Use sklearn scaler

1. Drop target feature
2. scale the x results

In [14]:
from sklearn.preprocessing import StandardScaler

# TODO better naming(See other files already done)

x = train_all.drop(columns=['returnLabel'])
scaler = StandardScaler()

# Our x/y values
X = scaler.fit_transform(x)
y = train_all['returnLabel'].values

### 6. Trainieren Sie die folgenden Klassifikationsmodelle und probieren Sie die angegebenen Hyperparameter mittels Cross-Validation aus:

> DOING

In [19]:
from sklearn.linear_model import LogisticRegression

'''
DOING
'''

#1. Logistische Regression: C :[0.1,1,4,5,6,10,30,100] und penalty: ["l1", "l2"]

# logisticRegr = LogisticRegression(max_iter=1000, penalty = ["l1", "l2"], C=[0.1,1,4,5,6,10,30,100])
# logisticRegr.fit(X, y)

#2. Random Forest: n_estimators: [60,80,100,120,140] und max_depth: [2, 3, 4, 5]

# model = RandomForestClassifier(random_state=0)
# model.fit(X, y)

#3. Gradient Boosting Tree: gleiche Hyperparameter wie bei Random Forest

# parameter_candidates = [{'n_estimators': [60,80,100,120,140], 'max_depth': [2, 3, 4, 5]}]
# gbt = GradientBoostingClassifier(random_state=0)
# clf_cv = GridSearchCV(estimator=gbt, param_grid=parameter_candidates, n_jobs=-1)
# clf_cv.fit(X, y)
