In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
0,708746,17/08/2017,17/08/2017,916,103916,30-34,M,15,17,17,7350.000000,1,1.43,2.0,1.0
1,708749,17/08/2017,17/08/2017,916,103917,30-34,M,16,19,21,17861.000000,2,1.82,2.0,0.0
2,708771,17/08/2017,17/08/2017,916,103920,30-34,M,20,25,22,693.000000,0,0.00,1.0,0.0
3,708815,30/08/2017,30/08/2017,916,103928,30-34,M,28,32,32,4259.000000,1,1.25,1.0,0.0
4,708818,17/08/2017,17/08/2017,916,103928,30-34,M,28,33,32,4133.000000,1,1.29,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,1314410,19/08/2017,19/08/2017,45-49,F,109,111,114,1129773,252,358.189997,13,2.00,,
1139,1314411,19/08/2017,19/08/2017,45-49,F,110,111,116,637549,120,173.880003,3,0.00,,
1140,1314412,19/08/2017,19/08/2017,45-49,F,111,113,117,151531,28,40.289999,2,0.00,,
1141,1314414,17/08/2017,17/08/2017,45-49,F,113,114,117,790253,135,198.710000,8,2.00,,


### As we can see, the data has structuring issues from row 761. so lets restructure

In [3]:
#splitting the dataset to two dataframes, df2 with missing values
df1 = df[0:761]
df2 = df[761:]
# restructure the dataset by shifting the columns to match between df1 and df2
c = list(df2)
for x in range(12):
    c[x+1] = c[x+3]
    
df2.columns = c
# further restructuring
df2 = df2.iloc[:, :-2]
df2.rename(columns={'campaign_id': 'reporting_start','fb_campaign_id': 'reporting_end'}, inplace=True)

df2.insert(3, 'campaign_id',np.NaN)
df2.insert(4,'fb_campaign_id',np.NaN)

In [4]:
df2.head()

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
761,1121594,26/08/2017,26/08/2017,,,45-49,M,10,14,14,426500,72,128.279999,4,1.0
762,1121597,30/08/2017,30/08/2017,,,45-49,M,15,21,19,54237,7,10.78,2,1.0
763,1121598,30/08/2017,30/08/2017,,,45-49,M,15,19,18,506916,89,133.699999,2,2.0
764,1121599,30/08/2017,30/08/2017,,,45-49,M,15,17,18,250960,42,64.88,2,0.0
765,1121601,30/08/2017,30/08/2017,,,45-49,M,16,20,18,2286228,353,603.380002,16,7.0


In [5]:
# appending to get final dataframe
df = df1.append(df2, ignore_index=True)
df.head() 

Unnamed: 0,ad_id,reporting_start,reporting_end,campaign_id,fb_campaign_id,age,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion
0,708746,17/08/2017,17/08/2017,916,103916,30-34,M,15,17,17,7350.0,1,1.43,2.0,1.0
1,708749,17/08/2017,17/08/2017,916,103917,30-34,M,16,19,21,17861.0,2,1.82,2.0,0.0
2,708771,17/08/2017,17/08/2017,916,103920,30-34,M,20,25,22,693.0,0,0.0,1.0,0.0
3,708815,30/08/2017,30/08/2017,916,103928,30-34,M,28,32,32,4259.0,1,1.25,1.0,0.0
4,708818,17/08/2017,17/08/2017,916,103928,30-34,M,28,33,32,4133.0,1,1.29,1.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ad_id                1143 non-null   int64  
 1   reporting_start      1143 non-null   object 
 2   reporting_end        1143 non-null   object 
 3   campaign_id          761 non-null    object 
 4   fb_campaign_id       761 non-null    object 
 5   age                  1143 non-null   object 
 6   gender               1143 non-null   object 
 7   interest1            1143 non-null   object 
 8   interest2            1143 non-null   object 
 9   interest3            1143 non-null   int64  
 10  impressions          1143 non-null   float64
 11  clicks               1143 non-null   int64  
 12  spent                1143 non-null   float64
 13  total_conversion     1143 non-null   float64
 14  approved_conversion  1143 non-null   float64
dtypes: float64(4), int64(3), object(8)
mem

In [7]:
df.isna().sum()

ad_id                    0
reporting_start          0
reporting_end            0
campaign_id            382
fb_campaign_id         382
age                      0
gender                   0
interest1                0
interest2                0
interest3                0
impressions              0
clicks                   0
spent                    0
total_conversion         0
approved_conversion      0
dtype: int64

### Feature Engineering

In [8]:
df.drop(['reporting_start','reporting_end'],inplace=True,axis=1)

In [9]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
df['gender'] = lb.fit_transform(df['gender']) # label encode gender

In [10]:
# changing these variables to the proper format of an integer
df['total_conversion'] = df['total_conversion'].astype(int)
df['approved_conversion'] = df['approved_conversion'].astype(int)
df['impressions'] = df['impressions'].astype(int)

In [11]:
# get dummies for age
df = pd.concat([df,pd.get_dummies(df['age'],prefix='age')],axis=1)
df.drop('age',inplace=True,axis=1)

In [12]:
# The dataframe should look good now. 
df.head()

Unnamed: 0,ad_id,campaign_id,fb_campaign_id,gender,interest1,interest2,interest3,impressions,clicks,spent,total_conversion,approved_conversion,age_30-34,age_35-39,age_40-44,age_45-49
0,708746,916,103916,1,15,17,17,7350,1,1.43,2,1,1,0,0,0
1,708749,916,103917,1,16,19,21,17861,2,1.82,2,0,1,0,0,0
2,708771,916,103920,1,20,25,22,693,0,0.0,1,0,1,0,0,0
3,708815,916,103928,1,28,32,32,4259,1,1.25,1,0,1,0,0,0
4,708818,916,103928,1,28,33,32,4133,1,1.29,1,1,1,0,0,0


In [13]:
# Working on the missing data
# imputation of missing values
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler
X = pd.DataFrame(KNN(k=3).fit_transform(df))
X.columns = df.columns
X.index = df.index

Imputing row 1/1143 with 0 missing, elapsed time: 0.422
Imputing row 101/1143 with 0 missing, elapsed time: 0.422
Imputing row 201/1143 with 0 missing, elapsed time: 0.422
Imputing row 301/1143 with 0 missing, elapsed time: 0.422
Imputing row 401/1143 with 0 missing, elapsed time: 0.422
Imputing row 501/1143 with 0 missing, elapsed time: 0.422
Imputing row 601/1143 with 0 missing, elapsed time: 0.422
Imputing row 701/1143 with 0 missing, elapsed time: 0.422
Imputing row 801/1143 with 2 missing, elapsed time: 0.422
Imputing row 901/1143 with 2 missing, elapsed time: 0.438
Imputing row 1001/1143 with 2 missing, elapsed time: 0.438
Imputing row 1101/1143 with 2 missing, elapsed time: 0.453


In [14]:
X.isna().sum()

ad_id                  0
campaign_id            0
fb_campaign_id         0
gender                 0
interest1              0
interest2              0
interest3              0
impressions            0
clicks                 0
spent                  0
total_conversion       0
approved_conversion    0
age_30-34              0
age_35-39              0
age_40-44              0
age_45-49              0
dtype: int64

In [15]:
X['campaign_id'] = X['campaign_id'].astype(int)
X['fb_campaign_id'] = X['fb_campaign_id'].astype(int)
X['campaign_id'] = X['campaign_id'].replace(1177,1178) 

### MODELLING

In [16]:
df = X.copy()

df.spent=df.spent.astype(int)
df.interest1=df.interest1.astype(int)
df.interest2=df.interest2.astype(int)
df.interest3=df.interest3.astype(int)
df.campaign_id=df.campaign_id.astype('category')

In [17]:
df.dtypes

ad_id                   float64
campaign_id            category
fb_campaign_id            int32
gender                  float64
interest1                 int32
interest2                 int32
interest3                 int32
impressions             float64
clicks                  float64
spent                     int32
total_conversion        float64
approved_conversion     float64
age_30-34               float64
age_35-39               float64
age_40-44               float64
age_45-49               float64
dtype: object

In [19]:

df['approved_conversion'] = df['approved_conversion'].replace([range(2,22)], 1)

df.approved_conversion=df.approved_conversion.astype('category')
df['approved_conversion'].value_counts()

# to have class balance. 

1.0    584
0.0    559
Name: approved_conversion, dtype: int64

In [20]:
from sklearn.metrics import  classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

In [22]:
# Taking the log of the continious variables to mitigate kurtosis and skewdness as much as possible

col= [['interest1','interest2','interest3']]
for cols in col:
    df[cols] = np.log(df[cols])
    df[cols] = np.log(df[cols])

In [24]:
x = df[[ 'campaign_id','interest1','interest2','interest3','gender','age_30-34','age_35-39','age_40-44','age_45-49']]
y = df['approved_conversion']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)


logmodel = LogisticRegression()
logmodel= RFE(logmodel, 9)
logmodel.fit(X_train,y_train)



RFE(estimator=LogisticRegression(), n_features_to_select=9)

In [26]:
predictions = logmodel.predict(X_test)

In [27]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         0.0       0.59      0.59      0.59       167
         1.0       0.61      0.62      0.62       176

    accuracy                           0.60       343
   macro avg       0.60      0.60      0.60       343
weighted avg       0.60      0.60      0.60       343



In [28]:
print(logmodel.ranking_) ## For RFE 

[1 1 1 1 1 1 1 1 1]


In [29]:
print(logmodel.support_) ## For RFE

[ True  True  True  True  True  True  True  True  True]
