<img align='centre' src="assets/img/hello-world.png" alt="ml" title="ml"/>


<img align='centre' src="assets/img/title.png" alt="ml" title="ml"/>

<img align="left" src="assets/img/about-myself.png" alt="Gopi" title="Gopi" width="600" height="100" />

<img align="right" src="assets/img/gopi.jpg" alt="Gopi" title="Gopi" width="250" height="100" />

<img align='centre' src="assets/img/agenda.png" alt="ml" title="ml"/>


<img align='centre' src="assets/img/more-myself.png" alt="ml" title="ml"/>


<br>
<img align='centre' src="assets/img/ml_everywhere.jpeg" alt="ml" title="ml"/>
<br>

<img align='centre' src="assets/img/ml-applications.png" alt="ml" title="ml"/>

### And more importantly, 

<img align='centre' src="assets/img/nasscom-report.png" alt="ml" title="ml"/>

[source: Nasscom](https://nasscom.in/knowledge-center/publications/talent-demand-supply-report-ai-big-data-analytics)

<img align="centre" src="assets/img/gartnar-report.png" alt="ml" title="ml" width="900" height="600"/>

[source: Gartner](https://www.gartner.com/en/newsroom/press-releases/03-18-2020-gartner-says-strongest-demand-for-ai-talent-comes-from-non-it-departments)

<img align="centre" src="assets/img/post-covid-demand.png" alt="ml" title="ml" width="1000" height="800"/>

[source: AnalyticsIndiaMag](https://analyticsindiamag.com/ai-ml-remains-the-most-in-demand-tech-skill-post-covid/)

<img align="centre" src="assets/img/ml-salary.png" alt="ml" title="ml" width="900" height="600"/>

[source: UpGrad](https://www.upgrad.com/blog/machine-learning-engineer-salary-in-india-freshers-experienced/)

## Still not convinced?

<div align='center'> <h1> What is Machine Learning ? </h1> </div>

<img align="center" src="assets/img/ml-def.png" alt="ml" title="ml" width="700" height="500"/>

<img align="centre" src="assets/img/ml-simple-terms.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/ml-def-brief.png" alt="ml" title="ml" width="900" height="900"/>

## Breaking down the jargon

<img align="centre" src="assets/img/jargons-1.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/jargons-2.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/jargons-3.png" alt="ml" title="ml" width="900" height="900"/>

## Boring theory, sorry about that. Hang on for a while!

<img align="centre" src="assets/img/types-of-ml.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/supervised.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/supervised-problems.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/unsupervised.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/unsupervised-problems.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/reinforcement.png" alt="ml" title="ml" width="900" height="900"/>

<div> <h2> Talk is cheap, show me the code. </h2> </div>  
<div> <pre> <h3> - Linus Torvalds </h3> </pre> </div>  

<div> <p> <h3> Classifying Coronary Heart Disease using <span style="color:#0000A0"> Logistic Regression</span>. </h3> </p> </div>

In [None]:
# import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.impute import KNNImputer
%matplotlib inline

In [None]:
# Read the Framingham CSV into a DataFrame and preview its first rows
# (the trailing expression is what the notebook cell displays).
file_path = "data/framingham.csv"
data_df = pd.read_csv(file_path)
data_df.head()

In [None]:
# Display summary statistics of the loaded columns.
data_df.describe()

In [None]:
# Count the missing entries of every column, then keep only the columns
# that actually have some.
null_values = data_df.isna().sum()
null_summary = null_values.loc[null_values.gt(0)]
null_summary

In [None]:
# Show the dtypes of the features that contain missing values.
null_columns = data_df.loc[:, null_values.gt(0)]
null_columns.dtypes

In [None]:
# Fill missing values with a K-Nearest-Neighbours imputer.
# Every existing column first receives a 0/1 companion column named
# "<column> nan" marking the rows where it was missing, so imputed cells
# can be located afterwards. Note that these helper columns are part of the
# frame handed to the imputer.
col_names = data_df.columns
for feature in col_names:
    data_df[feature + " nan"] = data_df[feature].isnull().astype(int)

imputer = KNNImputer(n_neighbors=3)
imp_data_df = pd.DataFrame(
    imputer.fit_transform(data_df),
    columns=data_df.columns,
)
imp_data_df.head(10)

In [None]:
# Find the rows whose glucose value was originally missing and keep the
# first five of them for a spot check of the imputation.
glucose_null_indices = data_df.index[data_df['glucose nan'].eq(1)]
glucose_null_top5 = glucose_null_indices[:5]
glucose_null_top5

In [None]:
# Imputed glucose values (and their missing-flag) for the spot-check rows.
imp_data_df.loc[glucose_null_top5, ['glucose', 'glucose nan']]

In [None]:
# Collect the names of the helper columns (every column whose name matches
# the regex 'nan') so they can be dropped from the imputed frame.
nan_helper_cols = imp_data_df.filter(regex='nan').columns.tolist()
nan_helper_cols

In [None]:
# Keep every column except the missing-value helper columns.
imp_data_df = imp_data_df.drop(columns=nan_helper_cols)

In [None]:
# Confirm that no missing values remain after imputation.
imp_data_df.isna().sum()


In [None]:
# Replace extreme values with the column mean.
# A value is treated as extreme when it lies more than 1.5x the
# 5th-95th percentile spread below the 5th or above the 95th percentile.
numeric_columns = imp_data_df.select_dtypes(exclude="O")

for feature in numeric_columns:
    column = imp_data_df[feature]

    # Columns confined to [0, 1] are indicator flags (possibly fractional
    # after imputation). For a rare flag both percentiles are 0, so the
    # bounds collapse to 0 and every positive would be overwritten by the
    # mean - skip them.
    if column.between(0, 1).all():
        continue

    lower_pct = column.quantile(0.05)
    upper_pct = column.quantile(0.95)
    spread = upper_pct - lower_pct
    Lower_tail = lower_pct - 1.5 * spread
    Upper_tail = upper_pct + 1.5 * spread
    mean_value = column.mean()

    # One vectorized pass instead of a full-column replace() per value.
    is_outlier = (column > Upper_tail) | (column < Lower_tail)
    imp_data_df[feature] = column.mask(is_outlier, mean_value)

#### Relationship between education and cigsPerDay

In [None]:
# Mean cigsPerDay for each education level.
graph_1 = imp_data_df.groupby("education", as_index=False)["cigsPerDay"].mean()

In [None]:
# Regression plot of the per-education mean cigsPerDay.
plt.figure(figsize=(10, 8))
sns.regplot(x="education", y="cigsPerDay", data=graph_1)
plt.title("Graph showing cigsPerDay in every level of education.")
plt.xlabel("education", fontsize=20)
plt.ylabel("cigsPerDay", fontsize=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

#### Relationship between age and cigsPerDay, totChol, glucose.

In [None]:

# Per-age means of the three features compared in this section.
graph_3 = imp_data_df.groupby("age")["cigsPerDay"].mean()
graph_4 = imp_data_df.groupby("age")["totChol"].mean()
graph_5 = imp_data_df.groupby("age")["glucose"].mean()

# One line per feature; each series is named after its column, which is
# used as the legend label.
plt.figure(figsize=(12, 8))
for age_means in (graph_3, graph_4, graph_5):
    sns.lineplot(data=age_means, label=age_means.name)
# Title and y-label corrected: glucose is plotted too, and the values are
# per-age means rather than counts.
plt.title("Graph showing mean cigsPerDay, totChol and glucose in every age group.")
plt.xlabel("age", size=20)
plt.ylabel("mean value", size=20)
plt.xticks(size=12)
plt.yticks(size=12)
plt.show()

#### Which gender has a higher risk of coronary heart disease (CHD)?

In [None]:
# Total TenYearCHD cases for each value of the `male` flag.
graph_6 = imp_data_df.groupby("male", as_index=False)["TenYearCHD"].sum()

In [None]:
# Bar chart of total CHD cases per gender flag.
plt.figure(figsize=(12, 8))
sns.barplot(x="male", y="TenYearCHD", data=graph_6)
plt.title("Graph showing which gender has more risk of coronary heart disease CHD")
plt.xlabel("0 is female and 1 is male", fontsize=20)
plt.ylabel("total cases", fontsize=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

#### Which age group has more smokers.



In [None]:
# Sum of the currentSmoker flag for every age.
graph_7 = imp_data_df.groupby("age", as_index=False)["currentSmoker"].sum()

# Ages are cast to int so the x tick labels are whole numbers.
plt.figure(figsize=(16, 8))
sns.barplot(
    x=graph_7["age"].astype(int),
    y=graph_7["currentSmoker"],
)
plt.title("Graph showing which age group has more smokers.")
plt.xlabel("age", fontsize=20)
plt.ylabel("currentSmokers", fontsize=20)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

#### Relation between cigsPerDay and risk of coronary heart disease.


In [None]:
# Mean cigsPerDay for each TenYearCHD group.
graph_8 = imp_data_df.groupby("TenYearCHD", as_index=False)["cigsPerDay"].mean()

plt.figure(figsize=(12, 8))
sns.barplot(x=graph_8["TenYearCHD"], y=graph_8["cigsPerDay"])
plt.title("Graph showing the relation between cigsPerDay and risk of coronary heart disease.")
# Axis label typo fixed ("Rick" -> "Risk").
plt.xlabel("Risk of CHD", size=20)
plt.ylabel("cigsPerDay", size=20)
plt.xticks(size=12)
plt.yticks(size=12)
plt.show()

#### Relation between sysBP and risk of CHD.



In [None]:

# Mean sysBP for each TenYearCHD group.
graph_9 = imp_data_df.groupby("TenYearCHD", as_index=False)["sysBP"].mean()

plt.figure(figsize=(12, 8))
sns.barplot(x=graph_9["TenYearCHD"], y=graph_9["sysBP"])
plt.title("Graph showing the relation between sysBP and risk of CHD")
# Axis label typo fixed ("Rick" -> "Risk").
plt.xlabel("Risk of CHD", size=20)
plt.ylabel("sysBP", size=20)
plt.xticks(size=12)
plt.yticks(size=12)
plt.show()

## Modelling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
# Target: the TenYearCHD column (risk of CHD).
y = imp_data_df["TenYearCHD"]

# Features: every remaining column.
X = imp_data_df.drop("TenYearCHD", axis=1)

# Hold out 20% of the rows for evaluation.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

logistic_model = LogisticRegression()

In [None]:
# Fit on the training split. The previous argument `X_scaled` was never
# defined (NameError); the model is scored on the unscaled `test_X`, so it
# is trained on the matching unscaled `train_X`.
logistic_model.fit(train_X, train_y)

In [None]:
# Evaluate the fitted model on the held-out test split and report the score.
score = logistic_model.score(test_X, test_y)

print("accuracy of logistic regression is",score)

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of every feature, measured on the test split,
# displayed as a weight table labelled with the feature names.
perm = PermutationImportance(logistic_model, random_state=1)
perm.fit(test_X, test_y)
eli5.show_weights(perm, feature_names=list(test_X.columns))

In [None]:
# Hand-picked feature subset used for the second model.
# NOTE(review): presumably chosen from the permutation-importance table - confirm.
important_features = [
    'diaBP', 'prevalentHyp', 'age', 'heartRate', 'BMI',
    'education', 'male', 'totChol', 'sysBP',
]

new_data_df = imp_data_df.loc[:, important_features]

new_train_x, new_train_y = new_data_df, y

# Fresh 80/20 split restricted to the selected features.
Train_X, Test_X, Train_y, Test_y = train_test_split(
    new_train_x, new_train_y, test_size=0.2
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search for the logistic regression model.
# A generous iteration budget is set once on the estimator rather than
# being searched over.
model = LogisticRegression(max_iter=1000)

# Only settings that change the fitted model are searched. Run-time knobs
# (n_jobs, verbose, random_state) are not hyper-parameters, and each
# sub-space pairs a solver only with the penalties it supports so that no
# sampled candidate fails to fit.
regularisation_strengths = [0.01, 0.1, 1, 10, 100]
params = [
    {
        "solver": ["lbfgs", "newton-cg"],
        "penalty": ["l2"],
        "C": regularisation_strengths,
    },
    {
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "C": regularisation_strengths,
    },
]

# random_state makes the sampled candidates reproducible between runs.
random_search = RandomizedSearchCV(
    model, param_distributions=params, n_iter=5, cv=5, random_state=1
)

random_search.fit(Train_X, Train_y)

In [None]:
# Display the best estimator found by the randomized search.
random_search.best_estimator_

In [None]:
# Train and score a logistic regression with hand-written settings.
# NOTE(review): these values look copied from an earlier random-search
# result - confirm they still match the current best estimator.
model = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=40,
    random_state=3,
    n_jobs=3,
    verbose=2,
)
model.fit(Train_X, Train_y)

score = model.score(Test_X, Test_y)
print("The accuracy of our model is ", score)

<div> <p> <h3> Customer Segmentation using <span style="color:#0000A0"> K-means Clustering </span>. </h3> </p> </div>

In [None]:
# Read the mall-customers CSV into a DataFrame and preview its first rows.
file_path = "data/Mall_Customers.csv"
clus_data_df = pd.read_csv(file_path)
clus_data_df.head()


In [None]:
# Total rows and columns in the dataset, as a (rows, columns) tuple.
clus_data_df.shape

In [None]:
# Print the per-column non-null counts and dtypes.
clus_data_df.info()

In [None]:
# Number of missing values in every column.
clus_data_df.isna().sum()

In [None]:
# Feature selection for the model.
# Only the columns at positions 3 and 4 are used, as a plain array; the
# data has no label. NOTE(review): assumes those positions hold annual
# income and spending score - confirm against the CSV header.
X = clus_data_df.iloc[:, [3, 4]].to_numpy()

In [None]:
# Elbow method: fit k-means (k-means++ initialisation) for k = 1..10 and
# record each fit's inertia (within-cluster sum of squares) so the optimum
# number of clusters can be read off a plot.
from sklearn.cluster import KMeans
wcss = []

for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    wcss.append(kmeans.fit(X).inertia_)

In [None]:
# Plot inertia against the number of clusters; the "elbow" of the curve
# suggests the value of K.
plt.plot(list(range(1, 11)), wcss)
plt.title('The Elbow Method')
plt.xlabel('no of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# Build the final model: k-means with 5 clusters, fitted on X.
# y_kmeans holds the cluster index assigned to every row of X.
kmeansmodel = KMeans(n_clusters= 5, init='k-means++', random_state=0)
y_kmeans= kmeansmodel.fit_predict(X)

In [None]:
# Visualize all the clusters: one scatter series per cluster, then the
# centroids on top.
cluster_colours = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, colour in enumerate(cluster_colours):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(
        X[in_cluster, 0], X[in_cluster, 1],
        s=100, c=colour, label='Cluster {}'.format(cluster_id + 1),
    )

# Centroids must come from the fitted 5-cluster model (`kmeansmodel`).
# `kmeans` is the last model left over from the elbow loop and has a
# different number of clusters.
centroids = kmeansmodel.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

In [None]:
###Model Interpretation 
#Cluster 1 (Red Color) -> earning high but spending less
#Cluster 2 (Blue Color) -> average in terms of earning and spending
#cluster 3 (Green Color) -> earning high and also spending high [TARGET SET]
#cluster 4 (cyan Color) -> earning less but spending more
#Cluster 5 (magenta Color) -> Earning less , spending less


# We can put Cluster 3 into an alerting system where emails are sent to them on a daily basis, as these customers are easy to convert.
# For the other clusters, emails can be sent once a week or once a month.

<img align="centre" src="assets/img/questions.png" alt="ml" title="ml" width="900" height="900"/>

<img align="centre" src="assets/img/contact.png" alt="ml" title="ml" width="900" height="900"/>