In [None]:
# Clear every user-defined name from the interactive namespace; -f skips the confirmation prompt.
%reset -f

# AIML CA1

## Import General Dependencies

In [None]:
# Mathematical Dependencies
import numpy as np

# Data Manipulation Dependencies
import pandas as pd

# Graphing Dependencies
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Dependencies
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.pipeline import Pipeline

# Miscellaneous Dependencies
from typing import Callable, Dict # static typing

# Utility Functions
from utils.extraction import extract_attributes

In [None]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

## Utility Functions

## Part I

### Import Exclusive Dependencies

In [None]:
# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Classification Metrics
from sklearn.metrics import confusion_matrix

### Import Data

In [None]:
# Read the whole dataset description file into memory
with open('./data/agaricus-lepiota.names') as names_file:
    metadata: str = names_file.read()

# Pull the attribute definitions out of the text that sits between the
#   "7. Attribute Information" and "8. Missing" headings
attribute_section_pattern = r'7\. Attribute Information:.*\n((.|\n)*)8\. Missing'
attrs = extract_attributes(metadata, attribute_section_pattern)

# The keys of the extracted mapping serve as the dataframe's column names
cols = attrs.keys()

In [None]:
# Create the dataframe from ./data/agaricus-lepiota.data file,
#   using column names derived from ./data/agaricus-lepiota.names file.
# header=None because the column names come from `cols`, not from the file:
#   with header=0 the first line of the file is treated as a header row and
#   discarded, silently losing one record.
#   NOTE(review): assumes the .data file has no header line (as in the UCI copy) — confirm.
df = pd.read_csv(
    filepath_or_buffer='./data/agaricus-lepiota.data',
    sep=',',
    header=None,
    names=list(cols)
)

# Expand attribute codes to their full definitions.
# The result is assigned back to the column instead of calling
#   df[col].replace(..., inplace=True): that form mutates a temporary Series,
#   which emits a FutureWarning on pandas 2.x and leaves df untouched once
#   Copy-on-Write is enabled.
for col in cols:
    df[col] = df[col].replace(to_replace=attrs[col])

#### Inspect Data

In [None]:
# Preview the first ten records of the dataset
df.head(10)

#### Summarize Data

In [None]:
# Inspect overview of the dataset (column names, non-null counts, dtypes, memory usage)
df.info()

In [None]:
# Summary statistics of the dataset, shown with one row per column
df.describe().T

### Pre-Processing

#### EDA

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Number of distinct values per column, taken from the 'unique' row of describe()
df_unique = df.describe().loc['unique']

# Show the columns that hold fewer than two distinct values
df_unique.loc[df_unique < 2]

In [None]:
# Remove the 'veil-type' column from df in place
#   (presumably the single-valued column surfaced by the uniqueness check — verify)
df.drop(columns='veil-type', inplace=True)

In [None]:
# One-hot encode the dataframe, dropping the first level of every encoded column
df_ohe = pd.get_dummies(df, drop_first=True)

# Correlation of every encoded column with 'class_poisonous', strongest (by magnitude) first
target_corr = df_ohe.corr()['class_poisonous']
target_corr.sort_values(key=np.abs, ascending=False)

In [None]:
# NOTE(review): disabled experiment — nothing in this cell executes. It selected the encoded
#   features whose absolute correlation with 'class_poisonous' exceeds 0.5 and pair-plotted them.
# # Check correlation between attributes
# with pd.option_context('display.max_rows', None):
#     yoyo = pd.get_dummies(df, drop_first=True)
#     df_corr = yoyo.corr()
#     df_corr_targ = df_corr
#     df_minor_mask = df_corr_targ.apply(func=lambda s: np.abs(s['class_poisonous']) > 0.5)
#     df_corr_sort = df_corr_targ[df_minor_mask].sort_values(by='class_poisonous', key=lambda s: -np.abs(s))
#     # df_corr_sort.drop(labels='class_poisonous', inplace=True)
#     targeted_corr = df_corr.loc[df_corr_sort.index.values, df_corr_sort.index.values]
#     sns.pairplot(targeted_corr, hue='class_poisonous', diag_kind=None)

In [None]:
# NOTE(review): disabled experiment — nothing in this cell executes. It ranked the one-hot
#   encoded features with a chi-square test (SelectKBest, k=5) against 'class_poisonous'.
# chi square
# from sklearn.feature_selection import chi2, SelectKBest

# # t = pd.get_dummies(df, drop_first=True).groupby(by='class_poisonous').sum().astype(int)
# t = pd.get_dummies(df).drop(labels='class_edible', axis=1)

# s = SelectKBest(score_func=chi2, k=5).fit(t.drop(labels='class_poisonous', axis=1), t[['class_poisonous']])
# good_preds = t.drop(labels='class_poisonous', axis=1).columns.values[s.get_support()]
# sns.pairplot(data=t.drop(labels=t.columns.values[t.columns.values]))

In [None]:
# One-hot encoded copy of df, built with the same arguments as df_ohe
dfdf = pd.get_dummies(data=df, drop_first=True)

In [None]:
from utils.plotting import format_label
def plot_A(df: pd.DataFrame):
    """Draw a count plot of the 'class' column with each bar annotated.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'class' column; its row count is the denominator
        for the percentage shown on each bar.

    Returns
    -------
    The Axes object returned by ``sns.countplot``.
    """
    n_records = df.shape[0]

    ax = sns.countplot(data=df, x='class', palette='deep')
    ax.set_title(label='General Data Distribution')
    ax.set_xlabel(xlabel='Type')
    ax.set_ylabel(ylabel='Number of Records')
    ax.set_ylim(top=5000)

    # Relabel the y ticks in thousands; the ticks are read after the limit is set.
    # presumably format_label applies the formatter to every tick value — verify in utils.plotting
    ticks_in_thousands = ax.get_yticks() / 1000
    ax.set_yticklabels(labels=format_label(
        ticks_in_thousands, lambda s: f'{round(s)}k'))

    # Annotate every bar with its height and its share of all records,
    #   offset slightly right of the bar's left edge and above its top
    for bar in ax.patches:
        count = bar.get_height()
        anchor = (bar.get_x() + 0.21, count + 70)
        ax.annotate(text=f'{count} ({count/n_records*100:.1f}%)', xy=anchor)
    return ax
# Draw the class distribution chart and display the resulting Axes
ax_a = plot_A(df)
ax_a

### Data Partitioning

In [None]:
# Separate the predictors from the target column
X = df_ohe.drop(columns='class_poisonous')
y = df_ohe['class_poisonous']

# Hold out 20% of the records for testing.
# random_state is fixed so that the split — and every score computed from it —
#   is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

### Model Training

In [None]:
# Fit a logistic regression classifier with default settings on the training split
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# k-nearest-neighbours classifier with k = 4, fitted on the training split
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

# Score of the fitted classifier on the held-out split
knn.score(X_test, y_test)

In [None]:
# Mean 8-fold cross-validation score of k-NN for every k from 1 to 19
neighbours_range = list(range(1, 20))
result = np.array([
    cross_val_score(
        estimator=KNeighborsClassifier(n_neighbors=k), X=X, y=y, cv=8
    ).mean()
    for k in neighbours_range
])

# Plot the mean score against k, then print the raw values
sns.lineplot(x=neighbours_range, y=result)
print(result)

In [None]:
# NOTE(review): disabled grid search over several classifiers — nothing in this cell executes.
#   When enabled it fits a GridSearchCV (cv=5) on X / y and pickles it to
#   ./models/grid_search_clf.p. DummyEstimator is defined only inside this commented-out block.
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.base import BaseEstimator

# class DummyEstimator(BaseEstimator):
#     def fit(self): pass
#     def score(self): pass

# pipeline = Pipeline(steps=[
#     ('scaler', StandardScaler()),
#     ('clf', DummyEstimator())
# ])

# params = [
#     {
#         'clf': [KNeighborsClassifier()],
#         'clf__n_neighbors': np.arange(start=4, stop=10)
#     },
#     {
#         'clf': [LogisticRegression(solver='newton-cg')],
#         'clf__C': np.logspace(-1, 2, 3)
#     },
#     {
#         'clf': [GaussianNB()]
#     },
#     {
#         'clf': [SVC()],
#         'clf__C': np.logspace(-1, 2, 3)
#     },
#     {
#         'clf': [DecisionTreeClassifier()],
#         'clf__max_depth': [10, 20, 30]
#     }
# ]

# cv = GridSearchCV(estimator=pipeline, param_grid=params, cv=5)
# cv.fit(X=X, y=y)
# import pickle
# pickle.dump(obj=cv, file=open("./models/grid_search_clf.p", "wb"))
# print(cv.best_params_)
# print(cv.best_score_)
# print(cv.best_estimator_)
# pd.DataFrame(data=cv.cv_results_)

In [None]:
import pickle

# Load the pickled search object from ./models/grid_search_clf.p.
# The file is opened in binary mode ('rb'): pickle.load reads bytes, so the
#   default text mode fails. The `with` block also closes the handle.
# SECURITY: unpickling can execute arbitrary code — only load files produced by
#   this project, never files from an untrusted source.
with open('./models/grid_search_clf.p', 'rb') as model_file:
    mod = pickle.load(model_file)

# Display the stored cross-validation results
mod.cv_results_

## Part II

### Import Exclusive Dependencies

In [None]:
# Machine Learning Models (Regression)
from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso, Ridge, ElasticNet

### Import Data

In [None]:
# Load ./data/kc_house_data.csv into df2
df2 = pd.read_csv(filepath_or_buffer='./data/kc_house_data.csv')

#### Inspect Data

In [None]:
# Preview the first five records
df2.head(n=5)

In [None]:
# Pairwise correlation between the numeric columns.
# Non-numeric columns are excluded explicitly: since pandas 2.0, DataFrame.corr()
#   no longer drops them silently and raises on the non-numeric 'date' column.
df2.select_dtypes(include='number').corr()

#### Summarize Data

In [None]:
# Overview of df2 (column names, non-null counts, dtypes, memory usage)
df2.info()

In [None]:
# Summary statistics with one row per column, rounded to two decimals
df2.describe().T.round(2)

### Pre-Processing

#### EDA

In [None]:
# Count the missing values in each column
df2.isna().sum()

In [None]:
# Heat map of the correlation matrix on a fixed [-1, 1] colour scale.
# Only numeric columns are correlated: since pandas 2.0, DataFrame.corr() raises
#   on non-numeric columns (such as 'date') instead of dropping them silently.
sns.heatmap(data=df2.select_dtypes(include='number').corr(), cmap='RdBu', vmin=-1, vmax=1)

In [None]:
# Correlation of every numeric column with 'price', strongest (by magnitude) first;
#   'lat', 'long' and 'price' itself are left out of the ranking.
# select_dtypes keeps corr() working on pandas >= 2.0, where non-numeric columns
#   (such as 'date') are no longer dropped silently.
price_corr = df2.select_dtypes(include='number').corr()['price']
price_corr.sort_values(key=lambda x: np.abs(x), ascending=False).drop(['lat', 'long', 'price'])

In [None]:
# Print the number of distinct ids next to the number of non-null ids,
#   then remove the 'id' column from df2 in place
id_column = df2['id']
print(id_column.unique().size, id_column.count())
df2.drop(columns='id', inplace=True)

In [None]:
# Print the number of distinct zipcodes next to the number of non-null zipcodes
zipcode_column = df2['zipcode']
print(zipcode_column.unique().size, zipcode_column.count())

# Plot price against zipcode, then remove the 'zipcode' column from df2 in place
sns.relplot(x='zipcode', y='price', data=df2)
df2.drop(columns='zipcode', inplace=True)

In [None]:
# Scatter each listed feature against price on a 2x4 grid of panels
#   (seven features, so the last panel stays empty)
candidate_features = ['waterfront', 'floors', 'yr_renovated', 'sqft_lot',
                      'sqft_lot15', 'yr_built', 'condition']
fig, ax = plt.subplots(nrows=2, ncols=4)
for feature, panel in zip(candidate_features, ax.flat):
    sns.scatterplot(data=df2, x=feature, y='price', ax=panel)

In [None]:
# Copy of df2 without the listed columns.
# `columns=` is required here: drop(labels=...) defaults to axis=0 and would look
#   for these names among the row labels, raising KeyError.
df3 = df2.drop(columns=['waterfront', 'floors', 'yr_renovated', 'yr_built', 'condition'])

In [None]:
def plot_B(df: pd.DataFrame):
    """Scatter-plot price against the features most correlated with it.

    The numeric columns of ``df`` are ranked by the absolute value of their
    correlation with 'price'; 'lat', 'long' and 'price' itself are excluded and
    up to nine of the remaining columns are drawn on a 3x3 grid.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a numeric 'price' column.

    Returns
    -------
    The matplotlib Figure holding the grid of scatter plots.
    """
    # Restrict to numeric columns: since pandas 2.0, DataFrame.corr() raises on
    #   non-numeric columns instead of dropping them silently.
    price_corr = df.select_dtypes(include='number').corr()['price']
    ranked = price_corr.sort_values(key=lambda x: np.abs(x), ascending=False)
    # errors='ignore' lets the function run on frames that lack 'lat' / 'long'
    top_features = ranked.drop(['lat', 'long', 'price'], errors='ignore')[:9].index.values

    fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(8, 6))
    axs = np.array(ax).reshape((-1))
    for i, x in enumerate(top_features):
        sns.scatterplot(data=df, x=x, y='price', ax=axs[i])
    return fig
# Draw the grid of scatter plots for df2 and display the figure
f = plot_B(df=df2)
f

In [None]:
# Convert the 'date' column to pandas datetimes, reading the year first
df2['date'] = pd.to_datetime(df2['date'], yearfirst=True)

In [None]:
# Price against living area
sns.scatterplot(x='sqft_living', y='price', data=df2)

In [None]:
# Price against grade: an aggregated line with the individual records drawn over it
sns.lineplot(x='grade', y='price', data=df2)
sns.scatterplot(x='grade', y='price', data=df2)


In [None]:
# Distribution of price for each bedroom count
sns.boxplot(x='bedrooms', y='price', data=df2)

In [None]:
# FIXME(review): `df_tmp` is not defined in any visible cell, so this raises NameError on a
#   fresh kernel. Define it (presumably a filtered copy of df2 — confirm) or delete this cell.
sns.boxplot(data=df_tmp, x='bedrooms', y='price')

In [None]:
# Check for outliers
# FIXME(review): unfinished expression — df2[''] looks up a column named '' and raises KeyError.
#   Replace it with the intended boolean mask or delete this cell.
outliers2 = df2[df2['']]

#### Feature Engineering

#### Feature Selection

In [None]:
# Display the 'date' column
df2.loc[:, 'date']

In [None]:
# Correlation matrix of the numeric columns.
# select_dtypes is needed because, since pandas 2.0, DataFrame.corr() raises on
#   non-numeric columns (here 'date') instead of dropping them silently.
df2_corr = df2.select_dtypes(include='number').corr()

# Correlation of each remaining feature with 'price' (price itself excluded), highest first
df2_corr['price'].drop(labels='price').sort_values(ascending=False)

### Data Partitioning

In [None]:
from sklearn.preprocessing import Normalizer, RobustScaler, normalize

# Predictors and target for the price regression
X2 = df2[['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'waterfront']]
y2 = df2['price']

# Hold out 20% of the records for testing; random_state is fixed so the split
#   and every score computed from it are reproducible across runs.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42)

#! delete !#
# Scratch experiment: row-normalised features followed by linear regression.
# Pipeline steps must be estimator objects, so the Normalizer transformer is
#   used here; passing the plain `normalize` function makes Pipeline.fit raise TypeError.
pip = Pipeline(steps=[
    ('scaler', Normalizer()),
    ('linreg', LinearRegression())
])
# Not used in this cell; kept so the name model2 is still defined here
model2 = LinearRegression()
pip.fit(X=X2_train, y=y2_train)
print(pip.score(X2_train, y2_train))
print(pip.score(X2_test, y2_test))

### Model Training

In [None]:
# Linear regression: fit on the training split, then print its score on
#   the training split followed by the test split
model2 = LinearRegression().fit(X2_train, y2_train)
print(model2.score(X=X2_train, y=y2_train))
print(model2.score(X=X2_test, y=y2_test))

In [None]:
# Bayesian ridge regression: fit on the training split, then print its score on
#   the training split followed by the test split
model2 = BayesianRidge().fit(X2_train, y2_train)
print(model2.score(X=X2_train, y=y2_train))
print(model2.score(X=X2_test, y=y2_test))

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

# Fit Ridge, Lasso and ElasticNet (alpha=0.5) on the training split and print
#   each model's score on the test split.
# The placeholder pipeline that used to be built here was removed: it was never
#   used and referenced DummyEstimator, which is defined only in a commented-out
#   cell, so the cell failed with NameError before reaching the loop.
for m in [Ridge, Lasso, ElasticNet]:
    p = m(alpha=0.5)
    try:
        p.fit(X2_train, y2_train)
        print(p.score(X2_test, y2_test))
    except Exception as exc:
        # Carry on with the remaining models, but report the failure instead of hiding it
        print(f'{m.__name__} failed: {exc!r}')

### Model Scoring

### Model Evaluation

## Conclusions