In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

### Read our dataset

In [2]:
# Load the advertising data set from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="Advertising_Data_Set.csv")
# Leaving the frame as the last expression renders it below the cell.
df

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,Mega,54.732757
1,13.0,9.237765,2.409567,Mega,46.677897
2,41.0,15.886446,2.913410,Mega,150.177829
3,83.0,30.020028,6.922304,Mega,298.246340
4,15.0,8.437408,1.405998,Micro,56.594181
...,...,...,...,...,...
4567,26.0,4.472360,0.717090,Micro,94.685866
4568,71.0,20.610685,6.545573,Nano,249.101915
4569,44.0,19.800072,5.096192,Micro,163.631457
4570,71.0,17.534640,1.940873,Macro,253.610411


### Check for duplicated rows

In [3]:
# Number of rows that are exact repeats of an earlier row (first occurrence is not counted).
df.duplicated(keep="first").sum()

0

### Investigate the dataset

In [4]:
# Print column dtypes and non-null counts; columns with fewer non-null entries than rows have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572 entries, 0 to 4571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   TV            4562 non-null   float64
 1   Radio         4568 non-null   float64
 2   Social Media  4566 non-null   float64
 3   Influencer    4572 non-null   object 
 4   Sales         4566 non-null   float64
dtypes: float64(4), object(1)
memory usage: 178.7+ KB


### Check missing values

In [5]:
# Share of missing values per column, expressed as a percentage.
100 * df.isna().mean()

TV              0.218723
Radio           0.087489
Social Media    0.131234
Influencer      0.000000
Sales           0.131234
dtype: float64

In [6]:
# Remove every row that has at least one missing value, modifying `df` itself.
df.dropna(axis=0, how="any", inplace=True)

In [7]:
# Count of missing values per column; every entry should be 0 once the incomplete rows are gone.
df.isna().sum()

TV              0
Radio           0
Social Media    0
Influencer      0
Sales           0
dtype: int64

In [8]:
# Frequency of each influencer tier, most common first.
df["Influencer"].value_counts(sort=True, ascending=False)

Mega     1152
Micro    1148
Nano     1134
Macro    1112
Name: Influencer, dtype: int64

### Encode the influencer column

### Mega  --> more than a million followers
### Macro --> between 100K and a million followers
### Micro --> between 10K and 100K followers
### Nano  --> fewer than 10K followers


In [9]:
# Ordinal code for each influencer tier: the smallest tier gets 1, the largest gets 4.
Infulencer_Dict = {
    tier: rank
    for rank, tier in enumerate(("Nano", "Micro", "Macro", "Mega"), start=1)
}

In [10]:
# Replace the influencer tier labels with their ordinal codes from Infulencer_Dict.
#
# Series.map turns every value that is not a key of the dict into NaN. That
# includes the case where this cell is run a second time, because the column
# then already holds the integer codes. Validate the mapping before assigning
# so the column is never silently overwritten with NaN.
_influencer_encoded = df["Influencer"].map(Infulencer_Dict)
_unmapped_mask = _influencer_encoded.isna()
if _unmapped_mask.any():
    _unmapped_values = df.loc[_unmapped_mask, "Influencer"].unique().tolist()
    raise ValueError(
        f"Influencer contains values that are not keys of Infulencer_Dict: {_unmapped_values}. "
        "If the column is already encoded, re-run the notebook from the data-loading cell."
    )
df["Influencer"] = _influencer_encoded

In [11]:
# Display the Influencer column to inspect the result of the encoding.
df["Influencer"]

0       4
1       4
2       4
3       4
4       2
       ..
4567    2
4568    1
4569    2
4570    3
4571    2
Name: Influencer, Length: 4546, dtype: int64

### Remove Outliers

In [12]:
from datasist.structdata import detect_outliers

In [13]:
# List the column names, for reference when choosing the columns to check for outliers.
df.columns

Index(['TV', 'Radio', 'Social Media', 'Influencer', 'Sales'], dtype='object')

In [14]:
# Columns inspected for outliers (the three ad-spend channels).
outlier_features = ['TV', 'Radio', 'Social Media']
# Index labels of the rows that detect_outliers flags; the second argument (0) is the
# threshold passed to the library -- NOTE(review): confirm its exact meaning in the datasist docs.
idx_out = detect_outliers(df, 0, outlier_features)

In [15]:
# Display the index labels returned by detect_outliers.
idx_out

[4331,
 140,
 212,
 468,
 504,
 558,
 603,
 867,
 1235,
 1363,
 1412,
 1468,
 1872,
 1929,
 2050,
 2091,
 2096,
 2295,
 2346,
 2418,
 2772,
 3094,
 3302,
 3500,
 3786,
 3801,
 4178,
 4279,
 4375]

In [16]:
# Remove the flagged rows (selected by index label) from `df` itself.
df.drop(index=idx_out, inplace=True)

In [17]:
# Display the frame after the flagged outlier rows were dropped.
df

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
0,16.0,6.566231,2.907983,4,54.732757
1,13.0,9.237765,2.409567,4,46.677897
2,41.0,15.886446,2.913410,4,150.177829
3,83.0,30.020028,6.922304,4,298.246340
4,15.0,8.437408,1.405998,2,56.594181
...,...,...,...,...,...
4567,26.0,4.472360,0.717090,2,94.685866
4568,71.0,20.610685,6.545573,1,249.101915
4569,44.0,19.800072,5.096192,2,163.631457
4570,71.0,17.534640,1.940873,3,253.610411


### Split the data into train and test sets

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Features: every column except the target.
x = df.drop(columns="Sales")
# Target: the Sales column.
y = df["Sales"]

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
)

### Scale the numerical data

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Standardiser that centres each column on its mean and divides by its standard deviation.
scaler = StandardScaler(with_mean=True, with_std=True)

In [23]:
# List the column names, for reference when choosing the columns to scale.
df.columns

Index(['TV', 'Radio', 'Social Media', 'Influencer', 'Sales'], dtype='object')

In [24]:
# Continuous ad-spend columns to standardise; the already-encoded Influencer column is left out.
columns_to_be_scaled = [
    "TV",
    "Radio",
    "Social Media",
]

In [25]:
# Standardise the spend columns. The scaler is fitted on the training rows only,
# and the test rows are transformed with those same training statistics.
#
# The unscaled values are always read back from `x` (selected by index label)
# instead of from x_train / x_test themselves. Re-running this cell therefore
# never refits `scaler` on already-standardised data, which would otherwise turn
# the scaler saved later in the notebook into a near-identity transform.
# Assumes the index labels of `x` are unique.
_train_raw = x.loc[x_train.index, columns_to_be_scaled]
_test_raw = x.loc[x_test.index, columns_to_be_scaled]

# Work on explicit copies so the column assignment below cannot raise pandas'
# SettingWithCopyWarning on the frames returned by train_test_split.
x_train = x_train.copy()
x_test = x_test.copy()

x_train[columns_to_be_scaled] = scaler.fit_transform(_train_raw)
x_test[columns_to_be_scaled] = scaler.transform(_test_raw)

### Train the model

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
# Ordinary least-squares linear regression with an intercept term.
model = LinearRegression(fit_intercept=True)

In [28]:
# Summary statistics of the training features; the scaled columns should show mean ~0 and std ~1.
x_train.describe()

Unnamed: 0,TV,Radio,Social Media,Influencer
count,3613.0,3613.0,3613.0,3613.0
mean,-1.769965e-17,-3.834925e-17,1.415972e-16,2.502906
std,1.000138,1.000138,1.000138,1.113346
min,-1.67832,-1.874948,-1.527813,1.0
25%,-0.8767843,-0.7955485,-0.8193551,2.0
50%,-0.03708026,-0.02932791,-0.1130727,2.0
75%,0.8789606,0.7808938,0.6922328,4.0
max,1.756833,2.993213,2.960681,4.0


In [32]:
# Fit the regression on the training features and the matching Sales values.
model.fit(X=x_train, y=y_train)

In [33]:
# Predicted Sales for the held-out test rows.
y_pred = model.predict(X=x_test)

### Evaluate the model

In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
# Mean squared error between the true and predicted Sales on the test set (lower is better).
mean_squared_error(y_true=y_test, y_pred=y_pred)

8.884015326432257

In [36]:
# List the column names again, as a reminder of the feature order.
df.columns

Index(['TV', 'Radio', 'Social Media', 'Influencer', 'Sales'], dtype='object')

In [39]:
# Summary statistics of the cleaned, unscaled data; shows the value range of each column.
df.describe()

Unnamed: 0,TV,Radio,Social Media,Influencer,Sales
count,4517.0,4517.0,4517.0,4517.0,4517.0
mean,53.873146,18.057647,3.276354,2.499447,191.734768
std,26.058692,9.60256,2.136599,1.120235,92.854862
min,10.0,0.000684,3.1e-05,1.0,31.199409
25%,32.0,10.453755,1.523415,2.0,112.19288
50%,53.0,17.778746,3.041314,2.0,188.116376
75%,76.0,25.44111,4.774722,4.0,271.126009
max,100.0,47.116293,9.638966,4.0,364.079751


### Test Instances

In [77]:
# Raw ad spend for one hypothetical campaign, in the same order as columns_to_be_scaled.
Test_instance = [80, 25, 7]
# Wrap the values in a one-row DataFrame that carries the column names the scaler
# was fitted with. This ties each value to its column explicitly, and avoids the
# "X does not have valid feature names" warning that recent scikit-learn versions
# emit when a bare list is passed to an estimator fitted on a DataFrame.
# The result is still a NumPy array of shape (1, 3).
Test_instance = scaler.transform(
    pd.DataFrame([Test_instance], columns=columns_to_be_scaled)
)




### append the influencer

In [78]:
# Append the encoded influencer tier (Macro) to the scaled spend features, giving the
# flat 4-element feature vector the model expects.
# Only the first len(columns_to_be_scaled) values are kept before appending, so
# re-running this cell does not add the influencer code a second time.
# The tier is looked up in Infulencer_Dict instead of hard-coding its numeric code.
Test_instance = np.append(
    np.ravel(Test_instance)[: len(columns_to_be_scaled)],
    Infulencer_Dict["Macro"],
)

In [79]:
# Display the assembled feature vector for the test instance.
Test_instance

array([0.99346566, 0.70807521, 1.73181265, 3.        ])

In [81]:
# Predict Sales for the single test instance.
# The vector is wrapped in a one-row DataFrame with the training column names, so
# each value is matched to the feature the model was fitted on and recent
# scikit-learn versions do not warn about missing feature names.
model.predict(pd.DataFrame([Test_instance], columns=x_train.columns))



array([284.82195675])

### Save your model for future use

In [82]:
import joblib

In [83]:
# Persist the fitted regression model and the fitted scaler so new data can be
# preprocessed and scored later without retraining.
joblib.dump(value=model, filename="model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

['scaler.pkl']