# F1 Loading data in a dataframe

In [1]:
def load_data(filetype):
    """Load a dataset into a pandas DataFrame, prompting for its source.

    Args:
        filetype: One of ``'csv'``, ``'excel'`` or ``'sql'``. For ``'csv'``
            and ``'excel'`` the file path is read from standard input; for
            ``'sql'`` the query text is read from standard input.

    Returns:
        pandas.DataFrame: The loaded data.

    Raises:
        ValueError: If ``filetype`` is not one of the supported values.
    """
    valid = {'sql', 'csv', 'excel'}
    # Validate before importing or prompting so a bad argument fails fast.
    if filetype not in valid:
        raise ValueError("filetype must be one of %r." % valid)

    import pandas as pd

    if filetype == 'csv':
        return pd.read_csv(input('Enter path of file'))
    if filetype == 'excel':
        return pd.read_excel(input('Enter path of file'))

    # pyodbc is only needed for the SQL source, so it is imported lazily and
    # CSV/Excel loading works on machines without an ODBC driver installed.
    import pyodbc

    query = input('Enter sql query')
    conn = pyodbc.connect(
        'Driver={SQL Server};'
        # Raw string: "\S" is not a valid escape sequence in a normal literal.
        r'Server=DESKTOP-AIS9287\SQLEXPRESS;'
        'Database=IBA_23371;'  # Parameters can be altered accordingly
        'Trusted_Connection=yes;'
    )
    try:
        return pd.read_sql_query(query, conn)
    finally:
        # Always release the connection, even if the query fails.
        conn.close()



In [37]:
def overview(df, head, n):
    """Print the shape, column dtypes and the first or last ``n`` rows.

    Args:
        df: DataFrame to summarise.
        head: ``'head'`` to show the first rows, ``'tail'`` to show the last.
        n: Number of rows to show.

    Raises:
        ValueError: If ``head`` is neither ``'head'`` nor ``'tail'``.
    """
    # Validate up front so nothing is printed for an invalid request.
    if head not in {'head', 'tail'}:
        raise ValueError("Either head or tail")

    # `display` only exists when running under IPython/Jupyter; fall back to
    # print so the function also works in a plain Python session.
    try:
        show = display
    except NameError:
        show = print

    print('The shape is :', df.shape)
    print('The column data types are: \n')
    print(df.dtypes)
    print('\n')
    if head == 'head':
        print('The first ' + str(n) + ' rows are :')
        show(df.head(n))
    else:
        print('The last ' + str(n) + ' rows are :')
        show(df.tail(n))
    

# F3  Remove unnecessary/useless columns

In [36]:
def remove_cols(df, *columns):
    """Drop the named columns from ``df`` in place.

    Args:
        df: DataFrame to modify.
        *columns: Labels of the columns to remove.

    Returns:
        None. ``df`` is modified in place.
    """
    # No plotting/numeric libraries are needed to drop columns, so the
    # function no longer imports them on every call.
    df.drop(columns=list(columns), inplace=True)
    

# F4 Remove rows containing a particular value of a given column

In [35]:
def drop_rows(df, column, value):
    """Drop, in place, every row whose ``column`` equals ``value``.

    Args:
        df: DataFrame to modify.
        column: Label of the column to test.
        value: Rows where ``df[column] == value`` are removed.

    Returns:
        None. ``df`` is modified in place.
    """
    # Rows are removed by index label, so rows sharing a label with a match
    # are removed as well when the index contains duplicates.
    matching_labels = df.index[df[column] == value]
    df.drop(index=matching_labels, inplace=True)

# F5 Determine the missing values in the whole dataset

In [34]:
def missing_details(df):
    """Summarise missing values and cardinality for every column of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        pandas.DataFrame: Indexed by the columns of ``df`` with three columns:
        ``'Missing value, %'`` (rounded to a whole percent),
        ``'Missing value count'`` and ``'N unique value'``.
    """
    import pandas as pd

    # Count the nulls once and reuse the result for both columns.
    null_counts = df.isnull().sum()
    return pd.DataFrame({
        'Missing value, %': (null_counts / df.shape[0] * 100).round(),
        'Missing value count': null_counts,
        'N unique value': df.nunique(),
    })

# F6 Analyze missing values of one or more columns using the missingno module

In [33]:
def missing_analysis(df, *cols):
    """Plot a missingno chart for the selected columns.

    The chart type is read from standard input and must be one of
    ``bar``, ``matrix``, ``heatmap`` or ``dendrogram``.

    Args:
        df: DataFrame to analyse.
        *cols: Labels of the columns to include in the chart.

    Raises:
        ValueError: If the entered chart type is not supported.
    """
    subset = df[list(cols)]
    chart_type = input('Enter the type of chart required')
    if chart_type not in ('bar', 'matrix', 'heatmap', 'dendrogram'):
        raise ValueError('Chart type must be one of bar,matrix,heatmap or dendrogram')

    # Imported after validation so a bad chart type fails without needing
    # the plotting dependency.
    import missingno as mn

    # Each supported chart type is the name of the missingno function to call.
    getattr(mn, chart_type)(subset)

# F7 Cater for missing values

In [32]:
def handle_mv(df, column, method):
    """Fill or drop missing values of ``column`` in place.

    Args:
        df: DataFrame to modify.
        column: Label of the column whose missing values are handled.
        method: One of ``'mean'``, ``'median'``, ``'mode'``, ``'value'``,
            ``'interpolation'``, ``'KNN'``, ``'group_mode'`` or ``'drop'``.
            Several methods prompt on standard input for extra parameters.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        ValueError: If ``method`` (or the entered interpolation direction)
            is not supported.
    """
    # The fills assign back to df[column] instead of calling
    # fillna(inplace=True) on the selected column: the chained in-place form
    # does not update `df` when pandas copy-on-write is active.
    if method == 'mean':
        df[column] = df[column].fillna(df[column].mean())
    elif method == 'median':
        df[column] = df[column].fillna(df[column].median())
    elif method == 'mode':
        modes = df[column].mode()
        # An all-missing column has no mode; leave it untouched.
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])
    elif method == 'value':
        # NOTE(review): the entered value is used as a string, even for
        # numeric columns - confirm whether a numeric conversion is wanted.
        replacement = input('Enter value to replace null values')
        df[column] = df[column].fillna(replacement)
    elif method == 'interpolation':
        direction = input('forward or backward interpolation?')
        if direction not in ('forward', 'backward'):
            raise ValueError('Select one from forward or backward')
        # Uses the `column` argument (previously the literal name 'column').
        df[column] = df[column].interpolate(method='linear', limit_direction=direction)
    elif method == 'KNN':
        import matplotlib.pyplot as plt
        from sklearn.impute import KNNImputer

        neighbors = int(input('Enter number of number of neighbors'))
        before = df.copy()  # kept to mark which points were originally missing
        n_features = int(input('Enter the number of columns to fit KNN imputer'))
        features = [input('Enter name of column') for _ in range(n_features)]
        if column not in features:
            features.append(column)
        df[features] = KNNImputer(n_neighbors=neighbors).fit_transform(df[features])
        x_axis = input('x axis label')
        y_axis = input('y axis label')
        # Colour the points that had a missing value on either plotted axis.
        nulls = before[x_axis].isna() | before[y_axis].isna()
        df.plot(x=x_axis, y=y_axis, kind='scatter', alpha=0.5, c=nulls, cmap='rainbow')
        plt.show()
    elif method == 'group_mode':
        group_col = input('Enter name of second column')

        def _fill_with_group_mode(group):
            # A group with no observed value has no mode; leave it as is.
            modes = group.mode()
            return group if modes.empty else group.fillna(modes.iloc[0])

        filled = df.groupby(group_col)[column].transform(_fill_with_group_mode)
        # Only the missing entries are replaced; existing values are kept.
        df[column] = df[column].fillna(filled)
    elif method == 'drop':
        # NOTE(review): this drops rows with a missing value in ANY column,
        # not only in `column` - confirm that this is intended.
        df.dropna(axis=0, inplace=True)
    else:
        raise ValueError('method must be one of mean,median,mode,value,interpolation,KNN,group_mode or drop')

# F8 Function for numerical data analysis 

In [44]:
def numerical_analysis(df, column):
    """Plot a numeric column and print a battery of normality checks.

    Shows a distribution plot, a box plot and a Q-Q plot, prints the
    descriptive statistics, skewness and kurtosis, then reports the
    Shapiro-Wilk, Anderson-Darling, chi-square, Lilliefors and
    Kolmogorov-Smirnov results. Missing values are ignored.

    Args:
        df: DataFrame holding the data.
        column: Label of the numeric column to analyse.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns
    import statsmodels.api as sm
    from scipy.stats import anderson, chisquare, kstest, kurtosis, shapiro, skew
    from statsmodels.stats.diagnostic import lilliefors

    # `display` only exists under IPython/Jupyter; fall back to print.
    try:
        show = display
    except NameError:
        show = print

    def _report(stat, p, if_normal, if_not_normal):
        # Shared 5% decision rule used by the p-value based tests.
        print('stat=%.3f,p=%.3f\n' % (stat, p))
        print(if_normal if p > 0.05 else if_not_normal)

    # A single NaN would turn every statistic below into NaN.
    values = df[column].dropna()

    # NOTE(review): distplot is deprecated in recent seaborn releases;
    # histplot(kde=True) is the documented replacement - confirm the
    # seaborn version in use before switching.
    sns.distplot(values)
    plt.show()
    # Keyword argument keeps the data on the x axis across seaborn versions.
    sns.boxplot(x=values)
    plt.show()
    sm.qqplot(values, line='s')
    plt.show()
    show(df[column].describe())

    Skew = skew(values)
    Kurtosis = kurtosis(values)
    print('Skew is', Skew)
    print('Kurtosis is', Kurtosis)
    if -2 < Skew < 2 and -7 < Kurtosis < 7:
        print('Can be considered normally distributed on the basis of skewness and kurtosis')
    else:
        print('Cannot be considered normally distributed on the basis of skewness and kurtosis')
    print('\n')

    print('Shapiro Wilk Test for Normality')
    stat, p = shapiro(values)
    _report(stat, p,
            'Normally distributed according to shapiro test',
            'Not Normally distributed according to shapiro test')
    print('\n')

    print('Anderson Test for Normality')
    result = anderson(values)
    print('stat=%.3f' % (result.statistic))
    for sig_level, cric_value in zip(result.significance_level, result.critical_values):
        if result.statistic < cric_value:
            print(f"Probably Normally distributed {cric_value} critical value at {sig_level} level of significance")
        else:
            print(f"Probably Not Normally distributed {cric_value} critical value at {sig_level} level of significance")
    print('\n')

    print('Chi square Normality test')
    # NOTE(review): scipy's chisquare with default arguments tests the
    # values as observed frequencies against equal expected frequencies,
    # which is not a normality test - confirm this is the intended check.
    stat, p = chisquare(values)
    _report(stat, p,
            'Normally distributed according to chisquare test',
            'Not Normally distributed according to chisquare test')
    print('\n')

    print('Lilliefors Test for Normality')
    stat, p = lilliefors(values)
    _report(stat, p,
            'Normally distributed according to lilliefors test',
            'Not Normally distributed according to lilliefors test')
    print('\n')

    print('Kolmogorov-Smirnov test')
    # Compare against a normal with the sample's own mean and standard
    # deviation; without `args` the reference is the standard normal N(0, 1),
    # which rejects any data that is not already standardised.
    stat, p = kstest(values, 'norm', args=(values.mean(), values.std()))
    _report(stat, p,
            'Normally distributed according to Kolmogorov-Smirnov test',
            'Not normally distributed according to Kolmogorov-Smirnov test')

# F9 Function for categorical data analysis

In [30]:
def categorical_analysis(df, column):
    """Print the value counts of a categorical column and draw a count plot.

    Args:
        df: DataFrame holding the data.
        column: Label of the categorical column to analyse.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    print(df[column].value_counts())
    plt.subplots(figsize=(12, 6))
    # Pass the series as `x` explicitly: recent seaborn releases read the
    # first positional argument as `data`, not as the x variable.
    sns.countplot(x=df[column])

# F10 Function to change the type of any column

In [29]:
def change_type(df, column, t):
    """Convert ``column`` of ``df`` to another type in place.

    Args:
        df: DataFrame to modify.
        column: Label of the column to convert.
        t: Target type: ``'integer'`` or ``'float'`` (numeric conversion,
            downcast to the smallest fitting dtype), ``'datetime'`` or
            ``'object'``.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        ValueError: If ``t`` is not one of the supported type names.
    """
    # Only pandas is needed here; the plotting libraries are not imported.
    import pandas as pd

    if t in ('integer', 'float'):
        # The type name doubles as the `downcast` option of to_numeric.
        df[column] = pd.to_numeric(df[column], downcast=t)
    elif t == 'datetime':
        df[column] = pd.to_datetime(df[column])
    elif t == 'object':
        df[column] = df[column].astype('object')
    else:
        raise ValueError('Must be one of integer,float,datetime,object')
        

# F11 Function to change the discretizations of a particular categorical column

In [43]:
def change_descr(df, column, value=None, new_value=None):
    """Clean the values and/or the name of a categorical column in place.

    Prompts on standard input:

    1. ``Remove white space?`` - ``Y`` strips every space from the string
       values of ``column``; ``N`` replaces ``value`` with ``new_value``.
    2. ``change column name?`` - on ``Y`` a further prompt chooses between
       entering a new name for ``column`` (``N``) or removing the spaces
       from every column name of ``df`` (``Y``).

    Args:
        df: DataFrame to modify.
        column: Label of the column to clean.
        value: Value to replace when white-space removal is declined.
        new_value: Replacement for ``value``.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        ValueError: If the first answer is neither ``Y`` nor ``N``.
    """
    strip_answer = input('Remove white space? Y or N')
    if strip_answer == 'Y':
        df[column] = df[column].str.replace(' ', '')
    elif strip_answer == 'N':
        # With no `value` there is nothing to replace; calling
        # replace(None, None) is not a meaningful request to pandas.
        if value is not None:
            df[column] = df[column].replace(value, new_value)
    else:
        raise ValueError('Y or N only')

    if input('change column name? Y or N') == 'Y':
        rename_mode = input('Remove with space? Y or N')
        if rename_mode == 'N':
            new_name = input('Enter new column name')
            df.rename(columns={column: new_name}, inplace=True)
        elif rename_mode == 'Y':
            # One rename for all columns; non-string labels are left alone.
            df.rename(
                columns=lambda name: name.replace(' ', '') if isinstance(name, str) else name,
                inplace=True,
            )

# F12 Function for data analysis - extract year, month etc., subtract dates


In [27]:
def date_function(df, column, subtract=False):
    """Add ``year`` and ``month`` columns derived from a datetime column.

    Args:
        df: DataFrame to modify.
        column: Label of a datetime column (it is accessed through ``.dt``).
        subtract: When true, prompt for a second column name and store
            ``df[column] - df[second]`` in a new ``date_diff`` column.

    Returns:
        None. ``df`` is modified in place.
    """
    dates = df[column]
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    if subtract:
        other = input('Enter 2nd column name')
        df['date_diff'] = df[column] - df[other]

# F13 Function to make a deep copy of a dataframe

In [26]:
def deep_copy(df):
    """Return a deep copy of ``df``.

    Args:
        df: DataFrame to copy.

    Returns:
        A new DataFrame produced by ``df.copy(deep=True)``.
    """
    # Copying needs no extra libraries, so nothing is imported here.
    return df.copy(deep=True)

# F14 Function to encode categorical into numerical

In [25]:
def encode(df, *columns):
    """Encode categorical columns as numbers.

    The encoding type is read from standard input: ``label``, ``ordinal``
    (the category-to-number mapping is then entered interactively for each
    column) or ``one-hot``.

    Args:
        df: DataFrame holding the columns.
        *columns: Labels of the columns to encode.

    Returns:
        For ``label`` and ``ordinal``: ``df`` itself, modified in place.
        For ``one-hot``: a new DataFrame produced by ``pandas.get_dummies``.

    Raises:
        ValueError: If the entered encoding type is not supported.
    """
    cols = list(columns)
    encode_type = input('Enter encode type')

    if encode_type == 'label':
        from sklearn.preprocessing import LabelEncoder

        for col in cols:
            # LabelEncoder expects a 1-D input, so pass the column itself
            # rather than a one-column DataFrame.
            df[col] = LabelEncoder().fit_transform(df[col])
        return df

    if encode_type == 'ordinal':
        for col in cols:
            print(col)
            n_values = int(input("enter a number of unique values present in the column: "))
            ord_dict = {}
            for _ in range(n_values):
                # NOTE(review): keys are entered as strings, so categories
                # that are not strings will not match the mapping.
                key = input('Unique value for the column')
                ord_dict[key] = int(input('Enter ordinal number for the value'))
            df[col] = df[col].map(ord_dict)
        return df

    if encode_type == 'one-hot':
        import pandas as pd

        return pd.get_dummies(df, columns=cols)

    # `Error` was never defined; ValueError matches the rest of the file.
    raise ValueError('Invalid encode type, select one from label,ordinal,one-hot')

# F15 Function to split dataframe into X (predictors) and y (label), apply standard scaling on X, apply the desired ML algorithm and output the results

In [24]:
def fit_model(df, label, algorithm, classification=True):
    """Split, scale, fit the chosen algorithm and print/plot its results.

    The data is split 80/20 (``random_state=42``), the predictors are
    standard-scaled, the model is fitted on the training part and evaluated
    on the held-out part.

    Args:
        df: DataFrame holding the predictors and the label column.
        label: Name of the target column.
        algorithm: One of ``'Regression'``, ``'KNN'``, ``'Naive Bayes'``,
            ``'Decision Tree'``, ``'Random Forest'``, ``'Gradient Boosting'``.
        classification: When true fit a classifier, otherwise a regressor.

    Raises:
        ValueError: If ``algorithm`` is unknown, or if ``'Naive Bayes'`` is
            requested for regression.
    """
    algorithms = {'Regression', 'KNN', 'Naive Bayes', 'Decision Tree',
                  'Random Forest', 'Gradient Boosting'}
    # Both checks run before any import or fitting. They previously referenced
    # the undefined names `valid` and `Error` and so raised NameError.
    if algorithm not in algorithms:
        raise ValueError("Algorithm must be one of %r." % algorithms)
    if not classification and algorithm == 'Naive Bayes':
        raise ValueError('Regression cannot be done with Naive Bayes')

    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn import metrics, tree
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor,
                                  RandomForestClassifier,
                                  RandomForestRegressor)
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.preprocessing import StandardScaler

    x = df.drop([label], axis=1)
    y = df[label]
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both parts,
    # so no statistics of the held-out rows leak into training.
    scaler = StandardScaler().fit(xtrain)
    xtrain = pd.DataFrame(scaler.transform(xtrain), columns=x.columns, index=xtrain.index)
    xtest = pd.DataFrame(scaler.transform(xtest), columns=x.columns, index=xtest.index)

    if classification:
        from sklearn.metrics import (accuracy_score, auc, classification_report,
                                     f1_score, log_loss, precision_score,
                                     recall_score, roc_curve)
        from yellowbrick.classifier import ConfusionMatrix

        # Map names to classes so only the requested estimator is built.
        classifiers = {
            "Regression": LogisticRegression,
            "KNN": KNeighborsClassifier,
            "Decision Tree": tree.DecisionTreeClassifier,
            "Naive Bayes": GaussianNB,
            "Random Forest": RandomForestClassifier,
            "Gradient Boosting": GradientBoostingClassifier,
        }
        classifier = classifiers[algorithm]()
        classifier.fit(xtrain, ytrain)
        y_pred = classifier.predict(xtest)
        # ROC/AUC and log-loss are defined on scores, not hard labels, so use
        # the predicted probabilities (second column = second class).
        y_proba = classifier.predict_proba(xtest)
        print("Accuracy  (%): \n", accuracy_score(ytest, y_pred)*100)
        fpr, tpr, _ = roc_curve(ytest, y_proba[:, 1])
        print("AUC  (%): \n", auc(fpr, tpr)*100)
        print("Precision: \n", precision_score(ytest, y_pred)*100)
        print("Recall (%): \n", recall_score(ytest, y_pred)*100)
        print("f1 score (%): \n", f1_score(ytest, y_pred)*100)
        print('logistic loss :\n', log_loss(ytest, y_proba, labels=classifier.classes_))
        print(algorithm + '\n' + classification_report(ytest, y_pred))
        visualizer = ConfusionMatrix(classifier)
        visualizer.fit(xtrain, ytrain)
        visualizer.score(xtest, ytest)
        # Newer yellowbrick releases name this method `show`; `poof` is the
        # older spelling, kept as a fallback.
        render = getattr(visualizer, 'show', None) or visualizer.poof
        render()
        plt.plot(fpr, tpr, marker='.')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
    else:
        regressors = {
            "Regression": LinearRegression,
            "KNN": KNeighborsRegressor,
            "Decision Tree": tree.DecisionTreeRegressor,
            "Random Forest": RandomForestRegressor,
            "Gradient Boosting": GradientBoostingRegressor,
        }
        regressor = regressors[algorithm]()
        regressor.fit(xtrain, ytrain)
        y_pred = regressor.predict(xtest)
        r2 = metrics.r2_score(ytest, y_pred)
        print('Mean Absolute Error is ', metrics.mean_absolute_error(ytest, y_pred))
        print('Mean Squared Error is ', metrics.mean_squared_error(ytest, y_pred))
        print('The R squared Error is ', r2)
        # r2 is measured on the held-out rows, so the adjustment uses their
        # count; it is undefined when there are too few rows.
        n = len(ytest)
        k = len(x.columns)
        r2_adj = 1 - ((1 - r2) * (n - 1) / (n - k - 1)) if n - k - 1 > 0 else float('nan')
        print('The adjusted R Squared Error is', r2_adj)
        plt.scatter(ytest, y_pred, c='green')
        plt.xlabel("True Value")
        plt.ylabel("Predicted value")
        plt.title("True value vs predicted value")
        plt.show()

In [50]:
def fit_oversample_model(df, label, algorithm, classification=True):
    """Fit the chosen algorithm on a randomly over-sampled training set.

    The data is split 80/20 (``random_state=42``), the predictors are
    standard-scaled, the training part is over-sampled with
    ``RandomOverSampler(sampling_strategy=0.5)`` and the model is evaluated
    on the untouched held-out part.

    Args:
        df: DataFrame holding the predictors and the label column.
        label: Name of the target column.
        algorithm: One of ``'Regression'``, ``'KNN'``, ``'Naive Bayes'``,
            ``'Decision Tree'``, ``'Random Forest'``, ``'Gradient Boosting'``.
        classification: When true fit a classifier, otherwise a regressor.

    Raises:
        ValueError: If ``algorithm`` is unknown, or if ``'Naive Bayes'`` is
            requested for regression.
    """
    algorithms = {'Regression', 'KNN', 'Naive Bayes', 'Decision Tree',
                  'Random Forest', 'Gradient Boosting'}
    # Both checks run before any import or fitting. They previously referenced
    # the undefined names `valid` and `Error` and so raised NameError.
    if algorithm not in algorithms:
        raise ValueError("Algorithm must be one of %r." % algorithms)
    if not classification and algorithm == 'Naive Bayes':
        raise ValueError('Regression cannot be done with Naive Bayes')

    import matplotlib.pyplot as plt
    import pandas as pd
    from imblearn.over_sampling import RandomOverSampler
    from sklearn import metrics, tree
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor,
                                  RandomForestClassifier,
                                  RandomForestRegressor)
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.preprocessing import StandardScaler

    x = df.drop([label], axis=1)
    y = df[label]
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both parts,
    # so no statistics of the held-out rows leak into training.
    scaler = StandardScaler().fit(xtrain)
    xtrain = pd.DataFrame(scaler.transform(xtrain), columns=x.columns, index=xtrain.index)
    xtest = pd.DataFrame(scaler.transform(xtest), columns=x.columns, index=xtest.index)

    # Only the training part is resampled; the test part keeps its original
    # class balance.
    # NOTE(review): a float sampling_strategy presumably requires a binary
    # target, so the regression path may only work for such labels - confirm.
    oversample = RandomOverSampler(sampling_strategy=0.5)
    x_res, y_res = oversample.fit_resample(xtrain, ytrain)

    if classification:
        from sklearn.metrics import (accuracy_score, auc, classification_report,
                                     f1_score, log_loss, precision_score,
                                     recall_score, roc_curve)
        from yellowbrick.classifier import ConfusionMatrix

        # Map names to classes so only the requested estimator is built.
        classifiers = {
            "Regression": LogisticRegression,
            "KNN": KNeighborsClassifier,
            "Decision Tree": tree.DecisionTreeClassifier,
            "Naive Bayes": GaussianNB,
            "Random Forest": RandomForestClassifier,
            "Gradient Boosting": GradientBoostingClassifier,
        }
        classifier = classifiers[algorithm]()
        classifier.fit(x_res, y_res)
        y_pred = classifier.predict(xtest)
        # ROC/AUC and log-loss are defined on scores, not hard labels, so use
        # the predicted probabilities (second column = second class).
        y_proba = classifier.predict_proba(xtest)
        print("Accuracy  (%): \n", accuracy_score(ytest, y_pred)*100)
        fpr, tpr, _ = roc_curve(ytest, y_proba[:, 1])
        print("AUC  (%): \n", auc(fpr, tpr)*100)
        print("Precision: \n", precision_score(ytest, y_pred)*100)
        print("Recall (%): \n", recall_score(ytest, y_pred)*100)
        print("f1 score (%): \n", f1_score(ytest, y_pred)*100)
        print('logistic loss :\n', log_loss(ytest, y_proba, labels=classifier.classes_))
        print(algorithm + '\n' + classification_report(ytest, y_pred))
        visualizer = ConfusionMatrix(classifier)
        visualizer.fit(x_res, y_res)
        visualizer.score(xtest, ytest)
        # Newer yellowbrick releases name this method `show`; `poof` is the
        # older spelling, kept as a fallback.
        render = getattr(visualizer, 'show', None) or visualizer.poof
        render()
        plt.plot(fpr, tpr, marker='.')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
    else:
        regressors = {
            "Regression": LinearRegression,
            "KNN": KNeighborsRegressor,
            "Decision Tree": tree.DecisionTreeRegressor,
            "Random Forest": RandomForestRegressor,
            "Gradient Boosting": GradientBoostingRegressor,
        }
        regressor = regressors[algorithm]()
        regressor.fit(x_res, y_res)
        y_pred = regressor.predict(xtest)
        r2 = metrics.r2_score(ytest, y_pred)
        print('Mean Absolute Error is ', metrics.mean_absolute_error(ytest, y_pred))
        print('Mean Squared Error is ', metrics.mean_squared_error(ytest, y_pred))
        print('The R squared Error is ', r2)
        # r2 is measured on the held-out rows, so the adjustment uses their
        # count; it is undefined when there are too few rows.
        n = len(ytest)
        k = len(x.columns)
        r2_adj = 1 - ((1 - r2) * (n - 1) / (n - k - 1)) if n - k - 1 > 0 else float('nan')
        print('The adjusted R Squared Error is', r2_adj)
        plt.scatter(ytest, y_pred, c='green')
        plt.xlabel("True Value")
        plt.ylabel("Predicted value")
        plt.title("True value vs predicted value")
        plt.show()

In [4]:
def knn_with_n(df, label, n=5, classification=True, over_sample=False, sampling_strategy=0.5):
    """Fit a k-nearest-neighbours model with ``n`` neighbours and report it.

    The data is split 80/20 (``random_state=42``) and the predictors are
    standard-scaled. The training part is optionally over-sampled with
    ``RandomOverSampler`` before fitting.

    Args:
        df: DataFrame holding the predictors and the label column.
        label: Name of the target column.
        n: Number of neighbours.
        classification: When true fit a classifier, otherwise a regressor.
        over_sample: When true, randomly over-sample the training part.
        sampling_strategy: Passed to ``RandomOverSampler``.
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.preprocessing import StandardScaler

    x = df.drop([label], axis=1)
    y = df[label]
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both parts,
    # so no statistics of the held-out rows leak into training.
    scaler = StandardScaler().fit(xtrain)
    xtrain = pd.DataFrame(scaler.transform(xtrain), columns=x.columns, index=xtrain.index)
    xtest = pd.DataFrame(scaler.transform(xtest), columns=x.columns, index=xtest.index)

    if over_sample:
        from imblearn.over_sampling import RandomOverSampler

        # Only the training part is resampled.
        oversample = RandomOverSampler(sampling_strategy=sampling_strategy)
        xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)

    if classification:
        from sklearn.metrics import (accuracy_score, auc, classification_report,
                                     f1_score, log_loss, precision_score,
                                     recall_score, roc_curve)
        from yellowbrick.classifier import ConfusionMatrix

        knn = KNeighborsClassifier(n_neighbors=n)
        knn.fit(xtrain, ytrain)
        y_pred = knn.predict(xtest)
        # ROC/AUC and log-loss are defined on scores, not hard labels, so use
        # the predicted probabilities (second column = second class).
        y_proba = knn.predict_proba(xtest)
        print("Accuracy  (%): \n", accuracy_score(ytest, y_pred)*100)
        fpr, tpr, _ = roc_curve(ytest, y_proba[:, 1])
        print("AUC  (%): \n", auc(fpr, tpr)*100)
        print("Precision: \n", precision_score(ytest, y_pred)*100)
        print("Recall (%): \n", recall_score(ytest, y_pred)*100)
        print("f1 score (%): \n", f1_score(ytest, y_pred)*100)
        print('logistic loss :\n', log_loss(ytest, y_proba, labels=knn.classes_))
        print(classification_report(ytest, y_pred))
        visualizer = ConfusionMatrix(knn)
        visualizer.fit(xtrain, ytrain)
        visualizer.score(xtest, ytest)
        # Newer yellowbrick releases name this method `show`; `poof` is the
        # older spelling, kept as a fallback.
        render = getattr(visualizer, 'show', None) or visualizer.poof
        render()
        plt.plot(fpr, tpr, marker='.')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()
    else:
        # The regressor was created as `knn` but then used through the
        # undefined name `regressor`; use one name throughout.
        knn = KNeighborsRegressor(n_neighbors=n)
        knn.fit(xtrain, ytrain)
        y_pred = knn.predict(xtest)
        r2 = metrics.r2_score(ytest, y_pred)
        print('Mean Absolute Error is ', metrics.mean_absolute_error(ytest, y_pred))
        print('Mean Squared Error is ', metrics.mean_squared_error(ytest, y_pred))
        print('The R squared Error is ', r2)
        # r2 is measured on the held-out rows, so the adjustment uses their
        # count (kept separate from the neighbour count `n`); it is undefined
        # when there are too few rows.
        n_obs = len(ytest)
        k = len(x.columns)
        r2_adj = 1 - ((1 - r2) * (n_obs - 1) / (n_obs - k - 1)) if n_obs - k - 1 > 0 else float('nan')
        print('The adjusted R Squared Error is', r2_adj)
        plt.scatter(ytest, y_pred, c='green')
        plt.xlabel("True Value")
        plt.ylabel("Predicted value")
        plt.title("True value vs predicted value")
        plt.show()
        

In [3]:
def knn_plot(df, label, classification=True, over_sample=False):
    """Plot the hold-out error of k-nearest-neighbours for k = 1..34.

    The data is split 80/20 (``random_state=42``) and the predictors are
    standard-scaled. The training part is optionally over-sampled with
    ``RandomOverSampler(sampling_strategy=0.5)``.

    Args:
        df: DataFrame holding the predictors and the label column.
        label: Name of the target column.
        classification: When true the error is the misclassification rate of
            a classifier; otherwise the mean absolute error of a regressor.
        over_sample: When true, randomly over-sample the training part.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.preprocessing import StandardScaler

    x = df.drop([label], axis=1)
    y = df[label]
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both parts,
    # so no statistics of the held-out rows leak into training.
    scaler = StandardScaler().fit(xtrain)
    xtrain = pd.DataFrame(scaler.transform(xtrain), columns=x.columns, index=xtrain.index)
    xtest = pd.DataFrame(scaler.transform(xtest), columns=x.columns, index=xtest.index)

    if over_sample:
        from imblearn.over_sampling import RandomOverSampler

        # Only the training part is resampled.
        oversample = RandomOverSampler(sampling_strategy=0.5)
        xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)

    # k cannot exceed the number of training rows, so cap the sweep there.
    k_values = range(1, min(35, len(xtrain) + 1))
    model_cls = KNeighborsClassifier if classification else KNeighborsRegressor

    error = []
    for k in k_values:
        knn = model_cls(n_neighbors=k)
        knn.fit(xtrain, ytrain)
        pred_i = knn.predict(xtest)
        if classification:
            # Share of misclassified held-out rows.
            error.append(np.mean(pred_i != ytest))
        else:
            # Continuous predictions almost never equal the target exactly,
            # so use the mean absolute error instead of a mismatch rate.
            error.append(np.mean(np.abs(pred_i - ytest)))

    plt.figure(figsize=(12, 6))
    plt.plot(k_values, error, color='red', marker='o', markersize=10)
    plt.title('Error rate K value')
    plt.xlabel('K value')
    plt.ylabel('Mean Error')

# F16 Function to apply ANOVA and output results

In [23]:
def anova(df, continous, categorical):
    """Fit ``continous ~ categorical`` with OLS and show its ANOVA table.

    Args:
        df: DataFrame holding both variables.
        continous: Name of the response column (left side of the formula).
        categorical: Right side of the formula, e.g. a column name.
    """
    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    # `display` only exists under IPython/Jupyter; fall back to print.
    try:
        show = display
    except NameError:
        show = print

    # NOTE(review): the names are concatenated into the formula verbatim, so a
    # numerically coded factor is presumably treated as continuous unless the
    # caller passes e.g. "C(col)" - confirm with callers.
    model = ols(continous + '~' + categorical, data=df).fit()
    show(sm.stats.anova_lm(model))

# F17 Function to generate correlation heatmaps

In [22]:
def corr_heatmap(df):
    """Draw an annotated heatmap of the pairwise column correlations.

    Only numeric and boolean columns are included.

    Args:
        df: DataFrame whose columns are correlated.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Recent pandas releases raise when corr() meets non-numeric columns
    # instead of silently skipping them, so restrict the frame explicitly.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()
    plt.subplots(figsize=(12, 12))
    sns.heatmap(corr, annot=True)

# F18 Function to generate a scatter plot of two columns

In [21]:
def scatter(df, column1, column2):
    """Draw a scatter plot of ``column1`` (x axis) against ``column2`` (y axis).

    Args:
        df: DataFrame holding both columns.
        column1: Label of the column plotted on the x axis.
        column2: Label of the column plotted on the y axis.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.subplots(figsize=(12, 12))
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them as positional arguments.
    sns.scatterplot(x=df[column1], y=df[column2])

In [45]:
def test_dependency(data, f1, f2, alpha):
    """Run a chi-square test of independence between two categorical columns.

    Prints the test quantities and then the decision twice: first from the
    critical-value comparison, then from the p-value comparison.

    Args:
        data: DataFrame holding both columns.
        f1: Label of the first categorical column.
        f2: Label of the second categorical column.
        alpha: Significance level, e.g. ``0.05``.
    """
    import pandas as pd
    import scipy.stats as s

    observed = pd.crosstab(data[f1], data[f2])
    result = s.chi2_contingency(observed)
    chi2_statistic, p_value, dof = result[0], result[1], result[2]
    critical_value = s.chi2.ppf(q=1 - alpha, df=dof)

    print('Significance level: ', alpha)
    print('Degree of Freedom: ', dof)
    print('chi-square statistic:', chi2_statistic)
    print('critical_value:', critical_value)
    print('p-value:', p_value)

    reject = "Reject H0,There is a relationship between 2 categorical variables"
    retain = "Retain H0,There is no relationship between 2 categorical variables"
    # Decision by critical value, then by p-value.
    print(reject if chi2_statistic >= critical_value else retain)
    print(reject if p_value <= alpha else retain)


# The name starts with "test_", so pytest would otherwise try to collect this
# statistical helper as a test case.
test_dependency.__test__ = False