In [None]:
# Colab-only: mount Google Drive at /content/gdrive; the dataset is read from this mount later.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import random 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the raw CSV from the mounted Drive and preview its first rows.
csv_path = '/content/gdrive/My Drive/UPX Certificate/ML-Atmospheric data.csv'
df_old = pd.read_csv(csv_path)
df_old.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved row index — confirm) and preview the result.
df = df_old.drop(columns=['Unnamed: 0'])
df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count of missing values per column.
df.isnull().sum()

In [None]:
# Descriptive summary statistics of the frame.
df.describe()

## Exploratory Data Analysis

**Since there are no null values, let's do some EDA.**

In [None]:
# Correlation coefficient matrix of the numeric features (whole dataset; no
# train/test split has been made at this point).
# `class` still holds string labels here, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so restrict to numeric columns first.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize = (15,10))
sns.heatmap(corr, cmap = 'YlGnBu', annot = True, linewidths = 0.5);

In [None]:
# Number of samples per class label.
df['class'].value_counts()

In [None]:
# Bar chart of the per-class sample counts.
sns.countplot(x = 'class', palette = 'GnBu_d', data = df);

**The data is not severely imbalanced, but it cannot be called balanced either, as the class counts differ by roughly half.**

In [None]:
# Bar plot drawn from the whole frame (passed as wide-form data, no x/y given).
plt.figure(figsize = (10,8))
sns.barplot(palette = 'BrBG', data = df);

In [None]:
# Box plots drawn from the whole frame on one large canvas, to eyeball spread and outliers.
plt.figure(figsize = (20,15))
sns.boxplot(data = df);

**There are outliers in 6 out of 10 variables.**

In [None]:
# Pairwise relationships between the variables.
# pairplot is a figure-level function: it builds its own figure, so a
# preceding plt.figure(...) call would only emit an extra, empty figure.
sns.pairplot(data=df);

In [None]:
# Encode the target as integers (g -> 1, h -> 0). Only the `class` column is
# mapped, which yields an integer column directly instead of relying on
# DataFrame.replace's deprecated object-dtype downcasting.
df_n = df.assign(**{'class': df['class'].map({'g': 1, 'h': 0})})
# Any label other than 'g'/'h' would become NaN; fail loudly rather than train on it.
assert df_n['class'].notna().all(), "unexpected label in 'class' (expected only 'g'/'h')"

# Correlation heatmap, now including the encoded target.
plt.figure(figsize = (20,15))
sns.heatmap(df_n.corr(), xticklabels = df_n.columns, yticklabels = df_n.columns, annot=True, linewidths=0.5);

In [None]:
# Per-class sample counts again, this time on the encoded frame df_n.
sns.countplot(x = 'class', palette = 'GnBu_d', data = df_n);

In [None]:
# Distribution of fLength within each class.
sns.boxplot(x = 'class', y = 'fLength', data = df_n);

In [None]:
# Distribution of fWidth within each class.
sns.boxplot(x = 'class', y = 'fWidth', data = df_n);

In [None]:
# Distribution of fSize within each class.
sns.boxplot(x = 'class', y = 'fSize', data = df_n);

## Splitting the dataset into train and test sets

In [None]:
# Target is the encoded `class` column; the features are every other column.
y = df_n['class']
X = df_n.drop(columns=['class'])

In [None]:
# Hold out 33% of the dataset for testing.
# random_state makes the split (and every score computed from it) reproducible
# across runs; stratify=y keeps the class ratio the same in both splits, which
# matters because the classes are not balanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.33, random_state = 42, stratify = y
)

**Scaling the data**

In [None]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transform to both the training and the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc_fit = sc.fit(X_train)
X_train, X_test = (sc_fit.transform(part) for part in (X_train, X_test))

# 1. Logistic Regression

In [None]:
# Logistic regression model: fit a default-configured classifier on the training split.
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

**Predictions**

In [None]:
# Class predictions from the fitted logistic regression on the test and train splits.
y_log_pred_test, y_log_pred_train = logreg.predict(X_test), logreg.predict(X_train)

**Train and Test accuracy**

In [None]:
# accuracy_score is imported directly from sklearn.metrics; no `metrics` name
# exists in this notebook, so the function is called by its own name.
print(accuracy_score(y_test, y_log_pred_test))    # test accuracy
print(accuracy_score(y_train, y_log_pred_train))  # train accuracy

# 2. Random Forest Classification

In [None]:
# Random forest model
from sklearn.ensemble import RandomForestClassifier

# Fitting a random forest is stochastic; a fixed random_state keeps the fitted
# model and its reported accuracies the same from run to run.
forest = RandomForestClassifier(random_state = 42)
forest.fit(X_train, y_train)

**Predictions**

In [None]:
# Class predictions from the fitted random forest on the test and train splits.
y_forest_pred_test, y_forest_pred_train = forest.predict(X_test), forest.predict(X_train)

**Train and Test accuracy**

In [None]:
# accuracy_score is imported directly from sklearn.metrics; no `metrics` name
# exists in this notebook, so the function is called by its own name.
print(accuracy_score(y_test, y_forest_pred_test))    # test accuracy
print(accuracy_score(y_train, y_forest_pred_train))  # train accuracy

# 3. SVC

In [None]:
# Support vector classifier: fit a default-configured SVC on the training split.
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)

**Predictions**

In [None]:
# Class predictions from the fitted SVC on the test and train splits.
y_svc_pred_test, y_svc_pred_train = svc.predict(X_test), svc.predict(X_train)

**Train and Test accuracy**

In [None]:
# accuracy_score is imported directly from sklearn.metrics; no `metrics` name
# exists in this notebook, so the function is called by its own name.
print(accuracy_score(y_test, y_svc_pred_test))    # test accuracy
print(accuracy_score(y_train, y_svc_pred_train))  # train accuracy