# Handbook for scripting

<h1> Table of Contents </h1>

1. [Introduction](#Intro)
2. [Importing the packages](#packages)
3. [Import the data set](#Data_set)
4. [Perform Descriptive Statistics on the dataset](#EDA)
5. [Handling missing values](#Missing_values)     
6. [Selecting the columns](#Select_columns)
7. [Example Snippets](#snippets)
8. [Machine Learning Models](#ML)
    - 8.1 [Generic functions](#Generic_functions)
    - 8.2 [Linear Regression](#Linear_Regression)
    - 8.3 [Logistic Regression](#Logistic_Regression)
    - 8.4 [Decision Tree - Classification](#RDT)
    - 8.5 [Decision Tree - Regression](#RDTR)
    - 8.6 [Random Forest - Regression](#Radom_Forest_Reg)
    - 8.7 [Random Forest - Classification](#Radom_Forest_Class)
    - 8.8 [K-Means](#K-Means)


## 1. Introduction <a id="Intro">
The main intention of this document is to help users while they perform ML activities on datasets.

## 2. Importing the packages  <a id='packages'>

In [1]:
# Import Required Packages 
import pandas as pd
import numpy as np
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn 
 
from sklearn import metrics

In [None]:
# Remove the display limits so that all columns and rows are printed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Print the output of every expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 3. Import the data set <a id="Data_set">

In [None]:
#Approach by using path 
your_local_path=r""
train_df = pd.read_csv(your_local_path+"titanic.csv")

#Approach by uploading the file to Jupyter and using read_csv
titanic_data = pd.read_csv("attachment_titanic_lyst9961.csv")

#Approach from url
url = 'https://raw.githubusercontent.com/upxacademy/ML_with_Python/master/Datasets/bikeshare.csv?token=AYxzdiGnjM610dBT7PuwUnUNOmm3bGcvks5ZFDyLwA%3D%3D'
bikes = pd.read_csv(url, index_col='datetime', parse_dates=True)

# Loading data from arff files
from scipy.io.arff import loadarff
phishing_data_raw = loadarff("PhishingData.arff")



## Import the DataSet in colabs
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')


# Instant import of small files (direct upload from the local machine)
# from google.colab import files
# upload = files.upload()


# import the dataset in azure notebooks
from azureml import Workspace
ws = Workspace()
ds = ws.datasets['bidata_Azure_all_data.csv']
frame = ds.to_dataframe()

import io
emp_data = pd.read_csv(io.BytesIO(upload['attachment_train_lyst4523.csv']))

In [None]:
# report = pandas_profiling.ProfileReport(flight_data) 
# report.to_file(output_file="Flight_dataset_Prandas_profiing.html")

In [None]:
titanic_data.head()

In [None]:
titanic_data.tail()

In [None]:
titanic_data.sample()

## 4. Perform Descriptive Statistics on the dataset <a id="EDA">

In [1]:
titanic_data.info()

NameError: name 'bank_data' is not defined

In [None]:
titanic_data.shape

In [None]:
titanic_data.describe()

In [None]:
    # Per-column profile: number of distinct values, number of missing values
    # and the frequency of every value. Point `df` at the dataframe to inspect.
    df = titanic_data
    for col in df.columns:
        column = df[col]
        print("=============== " + str(col) +" Start===============")
        print("\nUnique values :: " + str(column.nunique()))
        print("\nMissing values Count:: " + str(column.isnull().sum()))
        print("\nUnique Values Count:: \n" + str(column.value_counts()))
        print("=============== " + str(col) +" End===============")


In [None]:
#dropping the duplicates 
final_df.drop_duplicates(subset =['Company Code', 'Document Nr','Supplier Nr','Item'], 
                     keep = False, inplace = True) 

In [None]:
#custom designed function
def var_summary(df):
    '''
    Build a table of summary statistics for the numeric columns of ``df``.

    Non-numeric columns are left out: mean, median, standard deviation and
    quantiles are not defined for them, and recent pandas versions raise a
    TypeError instead of silently skipping such columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per statistic (Count, Sum, Unique_count, Mean, Median, Std,
        Min, the percentiles P1 ... P99.5, Max) and one column per numeric
        column of ``df``.
    '''
    numeric_df = df.select_dtypes(include="number")

    # (row label, quantile level) pairs, listed in output order
    percentiles = [("P1", 0.01), ("P5", 0.05), ("P10", 0.1), ("P25", 0.25),
                   ("P50", 0.5), ("P75", 0.75), ("P90", 0.9), ("P95", 0.95),
                   ("P99", 0.99), ("P99.5", 0.995)]

    index_df = ["Count", 'Sum', 'Unique_count', "Mean", "Median", "Std", "Min"]
    index_df += [label for label, _ in percentiles]
    index_df.append("Max")

    values = [numeric_df.count(), numeric_df.sum(), numeric_df.nunique(),
              numeric_df.mean(), numeric_df.median(), numeric_df.std(),
              numeric_df.min()]
    values += [numeric_df.quantile(level, axis=0) for _, level in percentiles]
    values.append(numeric_df.max())

    # Each Series becomes one row; the statistic names become the row labels
    new_df = pd.DataFrame(values, index=index_df, columns=numeric_df.columns)
    return new_df

In [None]:
var_summary(titanic_data)

In [None]:
df.index = np.arange(1, len(df) + 1)

## 5. Handling missing values and outliers <a id="Missing_values">

In [None]:
# finding null values
titanic_data.isnull().sum()
titanic_test_data.isnull().sum()

In [None]:
def missing_values(df, percentage):
    '''
    Return a copy of ``df`` without the columns whose share of missing
    values is greater than ``percentage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set to clean; it is not modified.
    percentage : float
        Threshold on a 0-100 scale. A column is dropped only when its
        percentage of missing values is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the columns that exceeded the threshold.
    '''
    percent_missing = df.isnull().sum() * 100 / len(df)
    # percent_missing is indexed by column name, so the labels to drop are
    # taken from the index of the filtered Series
    missing_drop = list(percent_missing[percent_missing > percentage].index)
    return df.drop(missing_drop, axis=1)

In [None]:
thresh = len(bidata) * .2
bidata.dropna(thresh = thresh, axis = 1, inplace = True)

In [None]:
# These are alternative strategies - keep the one you need. Each of them only
# fills the values that are still missing and leaves existing ages untouched.
# using interpolate function
titanic_data['Age'] = titanic_data['Age'].interpolate()
# using mean
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].mean())
# using median
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())
# using mode (mode() returns a Series because ties are possible; use the first value)
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].mode()[0])

In [None]:
# Filling the missing values with the mean of the columns

from sklearn.impute import SimpleImputer

# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed from
# scikit-learn. It always imputes column-wise, so the old axis=0 argument is gone.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(features)
X

## Plots

In [1]:
sns.distplot(train_df['trip_duration'], hist=False, color="g", kde_kws={"shade": True})


NameError: name 'sns' is not defined

In [None]:
spectf_df.hist(bins=10, figsize=(12,30), layout=(9,5))

## n.Categorical to Numerical <a id="C2N">

In [None]:
#Approach 1
y_dummies = pd.get_dummies(titanic_data.y, prefix='y', drop_first= False)
titanic_data = pd.concat([titanic_data, y_dummies], axis=1)

In [None]:
nerw_df =  pd.merge(final_df_1, outcome[['Suspicious','Company Code', 'Document Nr','Supplier Nr','Item']], how="left", on=['Company Code', 'Document Nr','Supplier Nr','Item'])

In [None]:
# For multiple columns


In [None]:
# Dummification of all the categorical fields
for col in bidata_cate.columns:
    y_dummies = pd.get_dummies(bidata[col], prefix=col, drop_first= False)
    bidata = pd.concat([bidata, y_dummies], axis=1)

In [None]:
# dummification by encoding 
from  sklearn.preprocessing import LabelEncoder
encoding_list = ['job','marital','education','default','housing','loan','contact','month','poutcome','y']
bank_data[encoding_list] = bank_data[encoding_list].apply(LabelEncoder().fit_transform)

In [None]:
#Another type of encoding
for feature in combined_set.columns: # Loop through all columns in the dataframe
    if combined_set[feature].dtype == 'object': # Only apply for columns with categorical strings
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes

## 6. Selecting the columns <a id="Select_columns">

### 1. Correlation <a id="correlation">

In [None]:
flight_data_corr=flight_data.drop([]).corr(method='pearson')



https://datascience.stackexchange.com/questions/64260/pearson-vs-spearman-vs-kendall



Parameters
method : {‘pearson’, ‘kendall’, ‘spearman’} or callable
Method of correlation:

pearson : standard correlation coefficient

kendall : Kendall Tau correlation coefficient

spearman : Spearman rank correlation

callable: callable with input two 1d ndarrays
and returning a float. Note that the returned matrix from corr will have 1 along the diagonals and will be symmetric regardless of the callable’s behavior.

min_periods : int, optional
Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation.

Returns
DataFrame
Correlation matrix.

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(flight_data_corr,annot=True, cmap="YlGnBu")

In [None]:
# custom designed function
flight_data_corr=flight_data.corr()
plt.figure(figsize=(20,20))
sns.heatmap(flight_data_corr,annot=True, cmap="YlGnBu")

def extract_x_columns(df_y):
    '''
    Pick the labels whose value lies in [0.1, 0.8] or in [-0.8, -0.1].

    ``df_y`` must support ``between`` and boolean indexing (for example a
    pandas Series of correlations against the target). The labels from the
    positive band come first, followed by those from the negative band.
    '''
    in_positive_band = df_y.between(0.1, 0.8)
    in_negative_band = df_y.between(-0.8, -0.1)
    positive_labels = df_y[in_positive_band].index
    negative_labels = df_y[in_negative_band].index
    return positive_labels.append(negative_labels)

x_columns = extract_x_columns(flight_data_corr)
x_columns

In [None]:
x_columns = extract_x_columns(flight_data_corr)
x_columns

### Anova code 

### 2. StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
# Learn the mean and standard deviation from spectf_df only, then reuse them
# for spectf_test so that both sets end up on the same scale.
# NOTE(review): assumes spectf_df is the training set - confirm.
spectf_df_nor = scalar.fit_transform(spectf_df)
spectf_test_nor = scalar.transform(spectf_test)

### 3. MinMaxScaler - Normalized scaling from 0 to 1

In [1]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(data)



NameError: name 'x' is not defined

## 7. Example code snippets <a id="snippets">

### 1. Using lambda function


In [None]:
titanic_data["Sex"]=  titanic_data["Sex"].apply(lambda x : 0 if x=='male' else 1)

In [None]:
# replace
kidney_df[['htn','dm','cad','pe','ane']] = kidney_df[['htn','dm','cad','pe','ane']].replace(to_replace={'yes':1,'no':0})

In [None]:
# to separate numerical and categorical 
nasa_data_numerical = nasa_data.select_dtypes(include=['float64','int64']) 
nasa_data_cate = nasa_data.select_dtypes(exclude=['float64','int64']) 


In [None]:
# to convert the data from bytes to strings
Aut_data_df.select_dtypes(include=['object']).stack().str.decode("utf-8").unstack()

In [None]:
#resetting the index
df.index = np.arange(1, len(df) + 1)

In [None]:
final_df['ICCC Results'] = np.where(final_df['Total Score']>=15, 1,0)

In [None]:
import datetime
from datetime import timedelta


def time_diff(date1, date2):
    '''
    Return the number of days between two timestamps.

    Both inputs must already be datetime-like objects (not strings) that
    support subtraction. The result is the ``days`` attribute of
    ``date1 - date2``, so the leftover hours/minutes of a partial day are
    not counted, and the result is negative when ``date1`` is the earlier one.
    '''
    return (date1 - date2).days

## SMOTE

In [None]:
unique, count = np.unique(Y, return_counts=True)
y_train_dict_value_count = { k:v for (k,v) in zip(unique, count)}
y_train_dict_value_count

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Oversampling and the classifier are chained so the grid search tunes the
# amount of oversampling together with the model fit.
pipe = make_pipeline(
    SMOTE(),
    LogisticRegression()
)

# Candidate values for the oversampling level
weights = np.linspace(0.005, 0.25, 10)
gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        # imbalanced-learn renamed SMOTE's `ratio` argument to `sampling_strategy`
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=3
)
grid_result = gsc.fit(x_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)
weight_f1_score_df = pd.DataFrame({ 'score': grid_result.cv_results_['mean_test_score'],
                                   'weight': weights })
weight_f1_score_df.plot(x='weight')

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# `ratio` and `fit_sample` were removed from imbalanced-learn; the current
# names are `sampling_strategy` and `fit_resample`.
sm = SMOTE(random_state=12, sampling_strategy=1.0)
x_train_smote, y_train_smote = sm.fit_resample(x_train, y_train)
# NOTE(review): resampling the test split adds synthetic rows to the evaluation
# data; usually only the training split is resampled. Kept so the names
# x_test_smote / y_test_smote stay available.
x_test_smote, y_test_smote = sm.fit_resample(x_test, y_test)

# Variant: oversample the min-max scaled features with sampling_strategy=0.3
sm = SMOTE(random_state=12, sampling_strategy=0.3)
x_smote, y_smote = sm.fit_resample(X_train_minmax, Y)

## 8. Machine Learning Models <a id="ML">

### 1. Generic functions <a id="Generic_functions">

In [None]:
# define X and Y columns
X= titanic_data[X_columns]
Y= titanic_data["Survived"]

# Approach 2
Y= titanic_data["Survived"]
# drop(..., inplace=True) returns None, so X would be None; take the returned
# copy instead (titanic_data itself is left unchanged).
# NOTE(review): make sure the target column is among the dropped columns.
X = titanic_data.drop(['default', 'student'], axis=1)

In [None]:
# Train and test split

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=62, stratify=Y)
x_train.shape
x_test.shape
y_train.shape
y_test.shape

### 2. Linear Regression <a id="Linear_Regression">

In [None]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [None]:
lm.fit(x_train, y_train)

In [None]:
predicted = lm.predict(x_test)
predicted.shape

In [None]:
metrics.mean_squared_error(y_test, predicted)
metrics.mean_absolute_error(y_test,predicted)

In [None]:
lm.coef_

In [None]:
lm.intercept_

## 3. Logistic Regression <a id="Logistic_Regression">

In [None]:
# Building logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression(C=0.6)

In [None]:
logreg.fit(x_train, y_train)

In [None]:
# now applying our learnt model on test and also on train data

y_pred_test = logreg.predict(x_test)
y_pred_train = logreg.predict(x_train)

In [None]:
# comparing the predicted labels with the real labels of the test data
print("Test Accuracy: ", metrics.accuracy_score(y_test, y_pred_test))

In [None]:
# comparing the predicted labels with the real labels of the train data
print("Train Accuracy: ", metrics.accuracy_score(y_train, y_pred_train))

In [None]:
# creating a confusion matrix to understand the classification
conf = metrics.confusion_matrix(y_test, y_pred_test)

In [None]:
print(conf)

In [None]:
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(conf,cmap = cmap,xticklabels=['Prediction No','Prediction Yes'],yticklabels=['Actual No','Actual Yes'], annot=True,
            fmt='d')

In [None]:
# Creating Classification Report

cr = metrics.classification_report(y_test, y_pred_test)
print(cr)

### 4. Decision Tree - Classification<a id="RDT">

In [None]:
# Importing the packages for Decision Tree Classifier


from sklearn import tree
my_tree_one = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)
my_tree_one

In [None]:
# Fit the decision tree model on your features and label

my_tree_one = my_tree_one.fit(X, y)

In [None]:
# The feature_importances_ attribute make it simple to interpret the significance of the predictors you include

list(zip(columns,my_tree_one.feature_importances_))

In [None]:
# The accuracy of the model

# score() returns a number, which must be converted before string concatenation
print("Score of the model:: " + str(my_tree_one.score(X, y)))

In [None]:
# Visualize the decision tree graph

# The with-block closes the file on exit, so no explicit close() is needed
with open('tree.dot','w') as dotfile:
    tree.export_graphviz(my_tree_one, out_file=dotfile, feature_names=columns, filled=True)

# You may have to install graphviz package using 
# conda install graphviz
# conda install python-graphviz

from graphviz import Source

# Read the exported DOT text back and hand it to graphviz for rendering
with open('tree.dot','r') as f:
    plot = Source(f.read())
plot

In [None]:
#Print Confusion matrix on Train Data
from sklearn import metrics

pred = my_tree_one.predict(X)
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion

In [None]:
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(df_confusion,cmap = cmap,xticklabels=['Prediction No','Prediction Yes'],yticklabels=['Actual No','Actual Yes'], annot=True,
            fmt='d')

### 4. Decision Tree - Regression<a id="RDTR">

In [None]:
# Importing the packages for Decision Tree Regression

from sklearn.tree import DecisionTreeRegressor
# A single regression tree bound to my_tree_one, because the following cells
# fit, score and export my_tree_one (export_graphviz draws one tree, not a forest).
my_tree_one = DecisionTreeRegressor(max_depth = 10, random_state = 1)

In [None]:
# Fit the decision tree model on your features and label
my_tree_one = my_tree_one.fit(X, y)

In [None]:
# The feature_importances_ attribute make it simple to interpret the significance of the predictors you include
list(zip(columns,my_tree_one.feature_importances_))

In [None]:
# The accuracy of the model
# score() returns a number, which must be converted before string concatenation
print("Score of the model:: " + str(my_tree_one.score(X, y)))

In [None]:
# Visualize the decision tree graph

# The with-block closes the file on exit, so no explicit close() is needed
with open('tree.dot','w') as dotfile:
    tree.export_graphviz(my_tree_one, out_file=dotfile, feature_names=columns, filled=True)

# You may have to install graphviz package using 
# conda install graphviz
# conda install python-graphviz

from graphviz import Source

# Read the exported DOT text back and hand it to graphviz for rendering
with open('tree.dot','r') as f:
    plot = Source(f.read())
plot

### 5.Random Forest <a id="Radom_Forest">

### 8.6.Random Forest_Regression<a id="Radom_Forest_Reg">

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rfr = RandomForestRegressor(max_depth=10, n_estimators=100, random_state=45)

In [None]:
rfr = rfr.fit(X,Y)

In [None]:
rfr.score(X,Y)

### 8.7.Random Forest_Classification<a id="Radom_Forest_Class">

In [1]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(max_depth = 10, n_estimators = 100, random_state = 1)

In [2]:
# Fitting the model on Train Data

my_forest = forest.fit(X, y)

NameError: name 'X' is not defined

In [None]:
# Print the accuracy score of the fitted random forest

print(my_forest.score(X, y))

In [None]:
# Making predictions

pred = my_forest.predict(X)

In [None]:
list(zip(columns,my_forest.feature_importances_))

In [None]:
df_confusion_rf = metrics.confusion_matrix(y, pred)
df_confusion_rf

In [None]:
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
sns.heatmap(df_confusion_rf, cmap = cmap,xticklabels=['Prediction No','Prediction Yes'],yticklabels=['Actual No','Actual Yes'], annot=True,
            fmt='d')

### 8.8  K-Means<a id="K-Means">

### 8.9 Naive Bayes Model <a id="NBM">

In [None]:
# Train Naive Bayes Classifier


from sklearn import naive_bayes
clf = naive_bayes.MultinomialNB()

In [None]:
model=clf.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print(y_pred)

## 9. Metrics

In [None]:
def generate_model_report(y_actual, y_predicted):
    '''
    Print accuracy, precision, recall and F1 score for a set of predictions.

    The scores come from the ``metrics`` module imported at the top of the
    notebook (``from sklearn import metrics``), each called with its default
    arguments.

    Parameters
    ----------
    y_actual : array-like
        True labels.
    y_predicted : array-like
        Labels predicted by the model, in the same order as ``y_actual``.
    '''
    print("Accuracy = " , metrics.accuracy_score(y_actual, y_predicted))
    print("Precision = " ,metrics.precision_score(y_actual, y_predicted))
    print("Recall = " ,metrics.recall_score(y_actual, y_predicted))
    print("F1 Score = " ,metrics.f1_score(y_actual, y_predicted))
    

In [None]:
def generate_auc_roc_curve(clf, X_test, y_true=None):
    '''
    Plot the ROC curve of a fitted classifier and show its AUC in the legend.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict_proba``; column 1 of its output is used as the
        score (presumably the positive class of a binary problem).
    X_test : array-like
        Features to score.
    y_true : array-like, optional
        True labels for ``X_test``. When omitted, the global ``Y_test`` is
        used, which is what the function read before this parameter existed.
    '''
    y_pred_proba = clf.predict_proba(X_test)[:, 1]
    if y_true is None:
        # Backward compatible fallback to the notebook-level variable
        y_true = Y_test
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred_proba)
    auc = metrics.roc_auc_score(y_true, y_pred_proba)
    plt.plot(fpr,tpr,label="AUC ROC Curve with Area Under the curve ="+str(auc))
    plt.legend(loc=4)
    plt.show()
    

In [4]:
weights = np.linspace(0.05, 0.95, 20)
print([{0: x, 1: 1.0-x} for x in weights])

[{0: 0.05, 1: 0.95}, {0: 0.09736842105263158, 1: 0.9026315789473685}, {0: 0.14473684210526316, 1: 0.8552631578947368}, {0: 0.19210526315789472, 1: 0.8078947368421052}, {0: 0.23947368421052628, 1: 0.7605263157894737}, {0: 0.28684210526315784, 1: 0.7131578947368422}, {0: 0.33421052631578946, 1: 0.6657894736842105}, {0: 0.381578947368421, 1: 0.618421052631579}, {0: 0.4289473684210526, 1: 0.5710526315789475}, {0: 0.47631578947368414, 1: 0.5236842105263159}, {0: 0.5236842105263158, 1: 0.47631578947368425}, {0: 0.5710526315789474, 1: 0.42894736842105263}, {0: 0.618421052631579, 1: 0.381578947368421}, {0: 0.6657894736842105, 1: 0.3342105263157895}, {0: 0.7131578947368421, 1: 0.2868421052631579}, {0: 0.7605263157894736, 1: 0.2394736842105264}, {0: 0.8078947368421052, 1: 0.19210526315789478}, {0: 0.8552631578947368, 1: 0.14473684210526316}, {0: 0.9026315789473683, 1: 0.09736842105263166}, {0: 0.95, 1: 0.050000000000000044}]
