In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from copy import deepcopy
from datetime import datetime
from scipy import stats
from collections import Counter

from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and the
# old names were removed in later releases, so prefer the new name when it is
# available and fall back to the legacy one on older installations.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Greek mu / sigma characters (presumably used to label mean and standard deviation).
u, s = "µ", "σ"

In [2]:
def test_normal(x):
    k2, p = stats.normaltest(x)
    alpha = 1e-3
    print("p = {:g}".format(p))
    if p < alpha:  # null hypothesis: x comes from a normal distribution
        print("It's not a normal distribution")
    else:
        print("It's a normal distribution")
        
    plt.hist(x)
    plt.show()

In [7]:
def stackedbarchart_from_contingency(contingencyTable,xlabel=None,title=None):
    """Plot a 100 % stacked bar chart of a contingency table and print the shares.

    For every row of ``contingencyTable`` the share (in percent) of each column
    within that row is computed. An extra ``'total'`` row holds the share of
    each column in the whole table. The shares are drawn as stacked bars (one
    bar per row plus ``'total'``) and printed as a DataFrame. When the table
    has at least two columns, the printed frame gets an additional column
    ``'Abweichung von Total'`` ("deviation from total"): the ratio of the first
    to the second column, minus the same ratio in the ``'total'`` row.

    Parameters
    ----------
    contingencyTable : pandas.DataFrame
        Table of counts, e.g. the result of ``pd.crosstab``.
    xlabel : str, optional
        Label of the x axis.
    title : str, optional
        Title of the figure.

    Returns
    -------
    None
        The chart is shown and the table of percentages is printed.

    Notes
    -----
    Rows whose counts sum to zero produce NaN/inf shares.
    """
    counts = contingencyTable.to_numpy(dtype=float)

    # Share of each column within its row, plus the share of each column in
    # the whole table (this becomes the trailing 'total' row).
    with np.errstate(divide="ignore", invalid="ignore"):
        row_shares = counts / counts.sum(axis=1, keepdims=True) * 100
        overall_shares = counts.sum(axis=0) * 100 / counts.sum()
    shares = np.vstack([row_shares, overall_shares])

    # Running sum across the columns: bar i reaches up to the combined share of
    # columns 0..i.
    stacked = np.cumsum(shares, axis=1)

    row_labels = list(contingencyTable.index) + ['total']
    bar_labels = [str(label) for label in row_labels]
    colors = ['grey','lightgrey','black','dimgrey']

    plt.figure(figsize=(5,5))
    # Draw the tallest (last) cumulative bar first so that the shorter ones are
    # painted on top of it. Colours are cycled so that tables with more
    # columns than colours do not fail.
    for col in reversed(range(stacked.shape[1])):
        plt.bar(
            bar_labels,
            stacked[:, col],
            label=contingencyTable.columns[col],
            color=colors[col % len(colors)],
        )

    plt.ylabel('Percentage')
    plt.xlabel(xlabel)
    plt.title(title)

    plt.legend(loc="upper right")
    plt.show()

    summary = pd.DataFrame(data=shares, columns=contingencyTable.columns, index=row_labels)
    if summary.shape[1] >= 2:
        # Positional access throughout: column labels need not be 0/1, and the
        # 'total' row is always the last one.
        first, second = summary.iloc[:, 0], summary.iloc[:, 1]
        totalratio = first.iloc[-1] / second.iloc[-1]
        summary['Abweichung von Total'] = first / second - totalratio
    else:
        print('Cant generate total')

    print(summary)
    print("\n\n")

In [5]:
# NOTE(review): this cell reads `df` and the columns "job", "classification" and
# "duration [month]", but the only `df` assignment in view happens in a later
# cell and its column listing contains none of these names. It looks like a
# template carried over from another dataset; confirm before running.

# Chi-squared test of independence between one categorical feature and the
# classification column (which needs to be binary).
crosstab = pd.crosstab(df["job"], df["classification"])
chi2, pval, dof, expectedFreq = chi2_contingency(crosstab)

# Two-sample t-test: split one numeric feature by the two class labels and
# compare the resulting groups.
cl_good = df.loc[df["classification"] == "good", "duration [month]"]
cl_bad = df.loc[df["classification"] == "bad", "duration [month]"]
t_val, p_val = ttest_ind(cl_good, cl_bad)

# Übersicht

In [8]:
# Load the order/return table from the CSV file (path is relative to the
# working directory of the notebook).
df = pd.read_csv("data/Superstore_OrderAndReturns.csv")
# Sanity check: print (rows, columns), then preview the first row.
print(df.shape)
df.head(1)

(9994, 22)


Unnamed: 0,orderReturned,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,False,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136


In [9]:
# List the column labels of the loaded table.
df.columns

Index(['orderReturned', 'Row ID', 'Order ID', 'Order Date', 'Ship Date',
       'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'Country',
       'City', 'State', 'Postal Code', 'Region', 'Product ID', 'Category',
       'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount',
       'Profit'],
      dtype='object')

In [17]:
# Parse the "YYYY-MM-DD" strings column-wise. pd.to_datetime is vectorised and
# yields NaT for missing values instead of failing on them.
df["order_dt"] = pd.to_datetime(df["Order Date"], format="%Y-%m-%d")
df["ship_dt"] = pd.to_datetime(df["Ship Date"], format="%Y-%m-%d")

# Aufgabe 1

In [27]:
# Days between order and shipment, computed on whole columns instead of
# materialising every row with df.iloc[i].
df["dt_diff"] = (df["ship_dt"] - df["order_dt"]).dt.days
# Kept as a plain list of per-row time differences for any later cell that reads it.
diffs = (df["ship_dt"] - df["order_dt"]).tolist()
df.head(1)

Unnamed: 0,orderReturned,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,...,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,order_dt,ship_dt,dt_diff
0,False,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,...,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,2016-11-08,2016-11-11,3


In [31]:
# Keep the rows whose order-to-ship delay lies between 0 and 10 days
# (both bounds inclusive), then show the shape of the result.
df_ten_days = df[df["dt_diff"].between(0, 10)]
df_ten_days.shape

(9994, 25)

# Aufgabe 2

# Aufgabe 3

# Aufgabe 4