<a href="https://colab.research.google.com/github/de218031/Python/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
import numpy as np
import pandas as pd

#plotting modules
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#classification modules
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


#import performance scores
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
warnings.filterwarnings('ignore')

In [3]:
def missing_value_check(dataset):
    """Summarise how many values are missing in each column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are inspected for null entries.

    Returns
    -------
    missing_value_df : pandas.DataFrame
        One row per column of ``dataset`` with the columns
        ``percent_missing`` (0-100) and ``Num_missing_val`` (null count),
        ordered by ``percent_missing`` from largest to smallest.
    percent_missing : pandas.Series
        Percentage of null entries per column, largest first.
    """
    # Null count per column; reused for both the count and the percentage.
    null_counts = dataset.isnull().sum()
    row_count = len(dataset)

    # Share of nulls per column expressed as a percentage, largest first.
    percent_missing = (null_counts * 100 / row_count).sort_values(ascending=False)

    # The two Series are aligned on the column names when the frame is built.
    summary = pd.DataFrame(
        {'percent_missing': percent_missing, 'Num_missing_val': null_counts}
    )

    # Final ordering is descending by percentage (worst columns on top).
    missing_value_df = summary.sort_values(by=['percent_missing'], ascending=False)

    return missing_value_df, percent_missing


# create a function to find outliers using IQR
def detecting_outliers(dataset):
    """Flag outliers using the 1.5 * IQR rule.

    Parameters
    ----------
    dataset : pandas.Series or pandas.DataFrame
        Numeric data; quartiles are taken with ``dataset.quantile``.

    Returns
    -------
    tuple
        ``(outliers, below_mask, above_mask)`` where

        * ``outliers`` is ``dataset`` indexed by ``below_mask | above_mask``;
        * ``below_mask`` is True where a value is smaller than
          ``Q1 - 1.5 * IQR``;
        * ``above_mask`` is True where a value is larger than
          ``Q3 + 1.5 * IQR``.

    Notes
    -----
    The second item is the *low-side* mask and the third item is the
    *high-side* mask. Callers that unpack them as ``upper, lower`` get them
    the other way round from what those names suggest.
    """
    first_quartile = dataset.quantile(0.25)
    third_quartile = dataset.quantile(0.75)
    spread = third_quartile - first_quartile  # inter-quartile range

    # Tukey fences: anything outside [low_fence, high_fence] is an outlier.
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    below_low_fence = dataset < low_fence
    above_high_fence = dataset > high_fence
    outliers = dataset[below_low_fence | above_high_fence]

    # Positional order is part of the contract: low-side mask, then high-side mask.
    return outliers, below_low_fence, above_high_fence


# The following function identifies highly correlated features.
# For every pair of columns whose absolute correlation exceeds the threshold it flags the later column (the earlier one is kept); nothing is removed from the dataset itself.
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame. Only numeric and boolean columns take part in the
        correlation; other columns (strings, dates, ...) are skipped.
    threshold : float
        A column is flagged when the absolute correlation with at least one
        column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. For each correlated pair the later
        column is flagged and the earlier one is kept. The function does not
        modify ``dataset``.
    """
    # pandas >= 2.0 raises when corr() meets non-numeric columns, so restrict
    # the frame up front instead of relying on the version-dependent default.
    numeric = dataset.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr()

    # We are interested in the absolute coefficient value.
    strength = np.abs(corr_matrix.to_numpy())

    # Strictly-lower-triangle mask: entry (i, j) is kept only when j < i, i.e.
    # column i is compared with the columns that come before it.
    earlier_pairs = np.tril(np.ones(strength.shape, dtype=bool), k=-1)

    # NaN coefficients compare False and are therefore never flagged.
    flagged = (earlier_pairs & (strength > threshold)).any(axis=1)

    return set(corr_matrix.columns[flagged])


In [1]:
# Validation metrics for classification
def validationmetrics(model, testX, testY, verbose=True):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(testX)``.
    testX : array-like
        Features passed to ``model.predict``.
    testY : array-like
        True labels compared against the predictions.
    verbose : bool, default True
        When True, print every metric and the confusion matrix and draw the
        confusion matrix as a heatmap.

    Returns
    -------
    dict
        ``accuracy``, ``precision`` and ``recall`` are percentages (the
        scikit-learn score multiplied by 100); ``f_score`` is the raw
        ``f1_score`` value and is NOT multiplied by 100; ``model_obj`` is the
        ``model`` that was passed in. Precision, recall and F-score use
        micro averaging.
    """
    predictions = model.predict(testX)

    # Accuracy (percentage)
    accuracy = accuracy_score(testY, predictions) * 100

    # Precision / recall (percentage). No pos_label here: scikit-learn ignores
    # it unless average='binary', and passing it only triggers a warning.
    precision = precision_score(testY, predictions, average='micro') * 100
    recall = recall_score(testY, predictions, average='micro') * 100

    # F-score is kept on scikit-learn's native scale so existing consumers of
    # the result map see the same number as before.
    f_score = f1_score(testY, predictions, average="micro")

    if verbose:
        # Computed once and reused for both the printout and the heatmap.
        conf_mat = confusion_matrix(testY, predictions)

        print("\n Accuracy: \n", accuracy)
        print("\n Precision of event Happening: \n", precision)
        print("\n Recall of event Happening: \n", recall)
        print("\n F-Score:\n", f_score)
        print("\n Confusion Matrix: \n", conf_mat)

        # confusion_matrix puts true labels on rows and predictions on columns.
        f, ax = plt.subplots(figsize=(5, 5))
        sns.heatmap(conf_mat, annot=True, linewidths=0.7, linecolor="red", fmt=".0f", ax=ax)
        ax.set_xlabel("y_pred")
        ax.set_ylabel("y_true")
        plt.show()

    res_map = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f_score": f_score,
        "model_obj": model
    }
    return res_map

In [9]:
from google.colab import files
df = files.upload()



Saving season-1516.csv to season-1516.csv


In [10]:
import io
df = pd.read_csv(io.StringIO(df['season-1516.csv'].decode('utf-8')))

In [11]:
# Display the loaded frame; the rendered output elides the middle rows and columns with "...".
# NOTE(review): consider df.head() here to keep the saved output small.
df

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,SP1,21/08/2015,Malaga,Sevilla,0,0,D,0,0,D,...,1.92,24,0.25,2.02,1.95,1.94,1.90,3.22,3.50,2.36
1,SP1,22/08/2015,Ath Madrid,Las Palmas,1,0,H,1,0,H,...,1.96,26,-1.75,2.02,1.95,1.95,1.89,1.25,6.24,16.90
2,SP1,22/08/2015,Espanol,Getafe,1,0,H,1,0,H,...,1.56,27,-0.50,1.95,1.90,2.01,1.94,2.10,3.28,4.19
3,SP1,22/08/2015,La Coruna,Sociedad,0,0,D,0,0,D,...,1.56,26,0.00,1.82,1.78,2.15,2.07,2.44,3.41,3.15
4,SP1,22/08/2015,Vallecano,Valencia,0,0,D,0,0,D,...,2.04,27,0.50,1.90,1.83,2.08,2.01,2.93,3.68,2.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,SP1,15/05/2016,Betis,Getafe,2,1,H,0,0,D,...,2.16,30,0.75,2.01,1.96,1.95,1.90,3.40,3.86,2.14
376,SP1,15/05/2016,Espanol,Eibar,4,2,H,2,0,H,...,2.12,29,-0.50,2.06,1.99,1.91,1.87,2.00,3.76,3.95
377,SP1,15/05/2016,Malaga,Las Palmas,4,1,H,2,1,H,...,2.02,29,-0.75,2.12,2.07,1.85,1.80,1.67,4.20,5.44
378,SP1,15/05/2016,Sp Gijon,Villarreal,2,0,H,1,0,H,...,1.98,30,-1.25,2.09,2.03,1.87,1.83,1.48,5.08,6.89
