# Effective Targeting of Advertisements

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw advertising dataset; the path is relative to the current working directory.
data = pd.read_csv('advertising_ef.csv')
# Preview the first rows of the loaded frame.
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'advertising_ef.csv'

In [None]:
# Summarise the columns, their dtypes and non-null counts.
data.info()

# Data Preprocessing

Dealing with Null Values

In [None]:
# Count the missing values in each column.
data.isna().sum()

There are null values present inside some of the columns of the dataset

In [None]:
# Give the time-on-site column an underscore-separated name.
data = data.rename(columns={'Daily Time Spent on Site': 'Daily_Time_Spent_on_Site'})

In [None]:
# Compare mean and median of the time-on-site column before choosing a fill value.
site_time = data['Daily_Time_Spent_on_Site']
print(site_time.mean())
print(site_time.median())

In [None]:
# Fill missing time-on-site values with the column median.
# The result is assigned back: calling fillna(inplace=True) on a selected column
# does not reliably update `data` under pandas Copy-on-Write.
data['Daily_Time_Spent_on_Site'] = data['Daily_Time_Spent_on_Site'].fillna(
    data['Daily_Time_Spent_on_Site'].median()
)

Imputed the null values of Daily Time Spent on Site with the column median

In [None]:
# Compare mean and median of Age before choosing a fill value.
age = data['Age']
print(age.mean())
print(age.median())

Here we check the mean and median of the Age column to decide which of the two to use for replacing the null values

In [None]:
# Fill missing ages with the column median.
# The result is assigned back: calling fillna(inplace=True) on a selected column
# does not reliably update `data` under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())

Since Age cannot take decimal values, we use the median of the Age column to replace the null values

Similarly, we check the 'Area Income' column

In [None]:
# Compare mean and median of Area Income before choosing a fill value.
area_income = data['Area Income']
print(area_income.mean())
print(area_income.median())

In [None]:
# Fill missing Area Income values with the column mean.
# The result is assigned back: calling fillna(inplace=True) on a selected column
# does not reliably update `data` under pandas Copy-on-Write.
data['Area Income'] = data['Area Income'].fillna(data['Area Income'].mean())

Filled the null values with the mean of Area Income; next we round the values to two decimal places

In [None]:
# Keep Area Income at two decimal places.
data['Area Income'] = data['Area Income'].round(2)

In [None]:
# Fill missing Daily Internet Usage values with the column mean.
# The result is assigned back: calling fillna(inplace=True) on a selected column
# does not reliably update `data` under pandas Copy-on-Write.
data['Daily Internet Usage'] = data['Daily Internet Usage'].fillna(
    data['Daily Internet Usage'].mean()
)

In [None]:
# Show the most frequent city (value_counts sorts by descending frequency).
data['City'].value_counts().index[0]

In [None]:
# Fill missing cities with the most frequent city.
# The result is assigned back: calling fillna(inplace=True) on a selected column
# does not reliably update `data` under pandas Copy-on-Write.
data['City'] = data['City'].fillna(data['City'].value_counts().index[0])

In [None]:
# Most frequent country, used as the fill value for missing entries.
most_common_country = data['Country'].value_counts().index[0]
# The result is assigned back: calling fillna(inplace=True) on a selected column
# does not reliably update `data` under pandas Copy-on-Write.
data['Country'] = data['Country'].fillna(most_common_country)

In [None]:
# Re-count the missing values per column after the imputation steps.
data.isna().sum()

All the Null values have been imputed

Finding the Correlation Between the Columns

In [None]:
# Correlate numeric columns only: the frame still holds string columns,
# on which DataFrame.corr() raises in pandas >= 2.0.
data.select_dtypes(include='number').corr()

In [None]:
import seaborn as sns

In [None]:
# Heatmap of the correlations between numeric columns; string columns are
# excluded because DataFrame.corr() raises on them in pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr())

In [None]:
# Histogram of the Age column.
sns.histplot(data['Age'])

In [None]:
# Joint plot of Age (x) against Area Income (y).
sns.jointplot(x='Age', y='Area Income', data=data, color="blue")

Analysing Joint Plot Between Area Income column and Age Column

In [None]:
# Joint plot of Age (x) against Daily Internet Usage (y).
sns.jointplot(x='Age', y='Daily Internet Usage', data=data, color='darkgreen')

Analysing Joint Plot Between Daily Internet Usage Column and Age Column

In [None]:
# Pairwise plots of the columns, coloured by the 'Clicked on Ad' value.
sns.pairplot(data, hue='Clicked on Ad', palette="Reds")

Analysing the relation between every variable present in the dataset and the target variable Clicked on Ad

Checking Whether There is Class imbalance in the Dataset

In [None]:
# Count how many rows fall into each 'Clicked on Ad' class.
data['Clicked on Ad'].value_counts()

In [None]:
# Persist the cleaned frame. The row index is written as well (to_csv default),
# which is where the 'Unnamed: 0' column handled after reloading comes from.
data.to_csv('Cleaned_Advertisements.csv')

The occurrence of '1' is 506 and of '0' is 503. The difference is small, so both classes are almost equally represented in the dataset.

Here we take our Cleaned Dataset On which we have applied Preprocessing

# Data Modelling

In [None]:
# Reload the cleaned dataset from the CSV written by the preprocessing section.
data = pd.read_csv('Cleaned_Advertisements.csv')

In [None]:
# Preview the first rows of the reloaded frame.
data.head()

In [None]:
# Summarise the columns, their dtypes and non-null counts of the reloaded frame.
data.info()

Checking the number of unique values present in the columns Ad Topic Line, City and Country

In [None]:
# Text columns to inspect for cardinality.
object_Dtype = ['Ad Topic Line', 'City', 'Country']
# describe() on object columns reports count, unique, top and freq.
data[object_Dtype].describe(include=['O'])

Here we can see these columns have many unique values in them which will make it difficult to find a pattern from these columns to make a suitable model for predictions.

So we reject the columns Ad Topic Line, City, Country 

Now we look at the Timestamp column, which has the object data type.
We split the column into two new columns, Month and Hour, so that our model can capture time-based trends.

In [None]:
# Parse the timestamps once, derive Month and Hour from them,
# and drop the raw Timestamp column.
parsed_timestamps = pd.to_datetime(data['Timestamp'])
data = data.assign(
    Month=parsed_timestamps.dt.month,
    Hour=parsed_timestamps.dt.hour,
).drop(columns=['Timestamp'])

In [None]:
# Preview the frame with the new Month and Hour columns.
data.head()

In [None]:
# Give the saved-index column a simpler name so it can be dropped next.
data = data.rename(columns={'Unnamed: 0': 'Unnamed'})

Drop the 'Unnamed: 0' column (renamed to 'Unnamed' above) from the dataset

In [None]:
# Remove the leftover index column.
data = data.drop(columns=['Unnamed'])

Label Encoding Gender Column using Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn the Gender categories, then replace the column with their integer codes.
label_encoder = LabelEncoder().fit(data['Gender'])
data['Gender'] = label_encoder.transform(data['Gender'])

In [None]:
# Preview the frame after encoding Gender.
data.head()

In [None]:
# Correlate numeric columns only: Ad Topic Line, City and Country are still
# string columns, on which DataFrame.corr() raises in pandas >= 2.0.
data.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlations between numeric columns; string columns are
# excluded because DataFrame.corr() raises on them in pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr())

In [None]:
# Pairwise plots of the columns, coloured by the 'Clicked on Ad' value.
sns.pairplot(data, hue='Clicked on Ad', palette="Purples")

Our dataset is a classification dataset, i.e. the target takes the value '0' or '1', meaning not clicked or clicked respectively.

So the models which we can apply on our dataset could be :

Decision Tree Classifier

Logistic Regression

Support Vector Machine (SVM)


From the above methods we choose Decision Tree Classifier & Logistic Regression for our dataset 

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Features are selected by column position, so this depends on the column
# order of `data` at this point in the notebook.
feature_positions = [0, 1, 2, 3, 6, 9, 10]
X = data.iloc[:, feature_positions]
y = data['Clicked on Ad']

# Hold out 30% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Display the feature frame and the target series together as a tuple.
X,y

In [None]:
# Depth-limited decision tree. random_state is fixed because the tree permutes
# features at each split, so without a seed the fitted tree (and the reported
# scores) can differ between runs. 42 matches the seed used for the split.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train,y_train)

In [None]:
# Predict on the training and the test features with the fitted tree.
ypred_train, ypred_test = (dt.predict(features) for features in (X_train, X_test))

In [None]:
# Display the predictions for the training rows.
ypred_train

In [None]:
# Display the predictions for the test rows.
ypred_test

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score,classification_report

# Compute every metric first, then report them in one place.
train_accuracy = accuracy_score(y_train, ypred_train)
test_accuracy = accuracy_score(y_test, ypred_test)
test_f1 = f1_score(y_test, ypred_test)
cm = confusion_matrix(y_test, ypred_test)

print(f"Accuracy score is on Train data {train_accuracy}")
print(f"\nAccuracy score is on Test data {test_accuracy}")
print(f"\nF1 score is on Test data {test_f1}")
print('\nConfusion Matrix : ')
print(cm)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree on the training split.
accuracies_dt = cross_val_score(dt, X_train, y_train, cv=10)
cv_mean, cv_std = accuracies_dt.mean(), accuracies_dt.std()

print("For Decision Tree : ")
print(f"Mean of Accuracy Score Using cross validation is {cv_mean}")
print(f"Standard Deviation of Accuracy Score Using cross validation is {cv_std}")

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split.
logmodel = LogisticRegression().fit(X_train, y_train)
logmodel

In [None]:
# Predict the test rows with the fitted logistic regression model.
predictions = logmodel.predict(X_test)

In [None]:
# Accuracy of the logistic regression predictions on the test split.
accuracy_score(y_test, predictions)

In [None]:
# Per-class precision/recall/F1 report, followed by the confusion matrix.
report_text = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(report_text)
print(matrix)

We notice that the Logistic Regression (LoR) model with default settings gives 90.42% accuracy

So, we change the solver hyperparameter of our LoR model to **'newton-cg'** to try to improve its accuracy

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using the 'newton-cg' solver, fitted on the training split.
logmodel = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
logmodel

In [None]:
# Predict the test rows with the refitted logistic regression model.
predictions = logmodel.predict(X_test)

In [None]:
# Accuracy of the refitted model's predictions on the test split.
accuracy_score(y_test, predictions)

In [None]:
# Per-class precision/recall/F1 report, followed by the confusion matrix.
report_text = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(report_text)
print(matrix)

So we conclude that after applying hyperparameter tuning on our LoR model we got the accuracy as 96.03%

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Column positions of the features to plot against the target.
# Named `feature_positions` rather than `list`, which would shadow the builtin
# for every cell executed afterwards.
feature_positions = [0, 1, 2, 3, 6, 9, 10]
for position in feature_positions:
    # One logistic-fit scatter per feature; y is jittered slightly so the
    # 0/1 points do not sit exactly on top of each other.
    sns.regplot(x=data.iloc[:, position], y="Clicked on Ad", y_jitter=0.03, data=data, logistic=True, ci=None)
    plt.show()


**So, after training different kinds of models on the dataset, we get:**

**From the Decision Tree Classifier Model we get :**
Mean of Accuracy score as 0.93


**From the Logistic Regression Model we get :**
Accuracy score on the test set as 0.90


**From the Logistic Regression Model (Hyperparameter tuning applied) we get :**
Accuracy score on the test set as 0.96

