# Effective Targetting of Advertisments

In [2]:
#import dependencies
import pandas as pd
import numpy as np

In [4]:
# import advertising csv & view data
ad_data = pd.read_csv("../resources/advertising_ef.csv")
ad_data.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Timestamp,Clicked on Ad
0,68.95,35.0,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,Female,Tunisia,27-03-2016 00:53,0
1,,31.0,68441.85,193.77,Monitored national standardization,West Jodi,Male,Nauru,04-04-2016 01:39,0
2,69.47,26.0,59785.94,236.5,Organic bottom-line service-desk,Davidton,Female,San Marino,13-03-2016 20:35,0
3,74.15,29.0,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,Male,Italy,10-01-2016 02:31,0
4,68.37,35.0,73889.99,225.58,Robust logistical utilization,South Manuel,Female,Iceland,03-06-2016 03:36,0


In [5]:
# view data info: columns, null count, data types, etc. 
ad_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1005 non-null   float64
 1   Age                       998 non-null    float64
 2   Area Income               998 non-null    float64
 3   Daily Internet Usage      1005 non-null   float64
 4   Ad Topic Line             1009 non-null   object 
 5   City                      998 non-null    object 
 6   Gender                    1009 non-null   object 
 7   Country                   996 non-null    object 
 8   Timestamp                 1009 non-null   object 
 9   Clicked on Ad             1009 non-null   int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 79.0+ KB


- Total of 10 columns and 1009 entries
- Dtypes: float64, int64, and objects
- Nulls are present in six columns 

## Data Preprocessing

In [6]:
# view the total amount of nulls in all 10 columns 
ad_data.isna().sum()

Daily Time Spent on Site     4
Age                         11
Area Income                 11
Daily Internet Usage         4
Ad Topic Line                0
City                        11
Gender                       0
Country                     13
Timestamp                    0
Clicked on Ad                0
dtype: int64

- replace the null values with its median instead of automatic deletion

In [8]:
#rename "Daily Time Spent on Site" Column 
ad_data.rename(columns={'Daily Time Spent on Site':'Daily_Time_Spent_on_Site'}, inplace=True)

# print describe on the column to find the median and view the distribution
print(ad_data['Daily_Time_Spent_on_Site'].describe())

count    1005.000000
mean       65.009463
std        15.879997
min        32.600000
25%        51.300000
50%        68.370000
75%        78.570000
max        91.430000
Name: Daily_Time_Spent_on_Site, dtype: float64


In [9]:
# fill null data on 'Daily Time Spent on Site':'Daily_Time_Spent_on_Site' column with its medians 
ad_data['Daily_Time_Spent_on_Site'].fillna(ad_data['Daily_Time_Spent_on_Site'].median(),inplace=True)

# double check for nulls 
print(ad_data['Daily_Time_Spent_on_Site'].isna().sum())

0


In [10]:
# find the median of 'Age' column for null replacement
print(ad_data['Age'].describe())

count    998.000000
mean      35.962926
std        8.772142
min       19.000000
25%       29.000000
50%       35.000000
75%       42.000000
max       61.000000
Name: Age, dtype: float64


In [12]:
# Fill 'Age' nulls with Median value 
ad_data['Age'].fillna(ad_data['Age'].median(),inplace=True)

# double check for nulls 
print(ad_data['Age'].isna().sum())

0


In [13]:
# Rename 'Area Income' to 'Area_Income'
ad_data.rename(columns={'Area Income':'Area_Income'}, inplace=True)

# print describe on the column to find the median and view the distribution
print(ad_data['Area_Income'].describe())

count      998.000000
mean     54958.573617
std      13381.286752
min      13996.500000
25%      46993.367500
50%      56998.245000
75%      65267.402500
max      79484.800000
Name: Area_Income, dtype: float64


In [17]:
# fill 'Area_Income' Null values with Median
ad_data['Area_Income'].fillna(ad_data['Area_Income'].median(),inplace=True)

# double check for nulls 
print(ad_data['Area_Income'].isna().sum())

0


In [18]:
# round 'Age' values to 2 decimal places
ad_data['Area_Income']= ad_data['Area_Income'].round(2)

# check for decimal places 
print(ad_data['Area_Income'])

0       61833.90
1       68441.85
2       59785.94
3       54806.18
4       73889.99
          ...   
1004    71384.57
1005    67782.17
1006    42415.72
1007    41920.79
1008    29875.80
Name: Area_Income, Length: 1009, dtype: float64


In [19]:
# rename 'Daily Internet Usage' column 
ad_data.rename(columns={'Daily Internet Usage':'Daily_Internet_Usage'}, inplace=True)

# print describe on the column to find the median and view the distribution
print(ad_data['Daily_Internet_Usage'].describe())

count    1005.000000
mean      180.041920
std        43.923438
min       104.780000
25%       138.870000
50%       183.420000
75%       218.800000
max       269.960000
Name: Daily_Internet_Usage, dtype: float64


In [20]:
# fill 'Daily_Internet_Usage' Null values with Median
ad_data['Daily_Internet_Usage'].fillna(ad_data['Daily_Internet_Usage'].median(),inplace=True)

# double check for nulls 
print(ad_data['Daily_Internet_Usage'].isna().sum())

0



- Although it is possible to fill the null values in both 'City' & 'Country', this may affect the integrity of the data. Thus, these null values for 'City' = 11 & 'Country'= 13 will be dropped. 

In [22]:
# drop 'City' null values
ad_data['City'] = ad_data['City'].dropna()

#check for null values
print(ad_data['City'].isna().sum())

11
