# Finline task solution - Python 3.6 - Ubuntu 17.10

In [1]:
import pandas as pd
import numpy as np
import csv

from plotly import __version__
import plotly.plotly as py
import plotly.graph_objs as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
    
df = pd.read_csv(
    "test_campaign.csv", 
    error_bad_lines=False, 
    encoding='windows-1251', 
    sep=';')

## Exploratory analysis

In [2]:
df.describe()

Unnamed: 0,Visits,Clicks,Conversions,Revenue,CTR,CR,CV,ROI,EPV,EPC
count,3054.0,3054.0,3054.0,3054.0,3054.0,0.0,0.0,0.0,0.0,0.0
mean,62.151277,8.238048,0.331369,10.246234,0.211907,,,,,
std,408.650572,56.650005,2.071491,63.773928,0.322609,,,,,
min,1.0,0.0,0.0,0.0,0.0,,,,,
25%,2.0,0.0,0.0,0.0,0.0,,,,,
50%,8.0,1.0,0.0,0.0,0.071429,,,,,
75%,28.0,4.0,0.0,0.0,0.285714,,,,,
max,11610.0,2562.0,54.0,1622.0,5.0,,,,,


In [3]:
# defining column presets for further use 

num_cols = df.columns.values[0:4]
float_cols = df.columns.values[4:8]
all_cols = list(df.columns)

all_cols

['Domain ID',
 'Visits',
 'Clicks',
 'Conversions',
 'Revenue',
 'Cost',
 'Profit',
 'CTR',
 'CR',
 'CV',
 'ROI',
 'EPV',
 'EPC']

In [4]:
init_notebook_mode(connected=True)


data = []


for col in all_cols[1:]:
    data.append(go.Box(
        y=df[col],
        name = col,
    ))

iplot(data)

## Cleaning up

In [5]:
# Cost and profit object datatype looks like it needs cleanup

df.dtypes

Domain ID       object
Visits           int64
Clicks           int64
Conversions      int64
Revenue        float64
Cost            object
Profit          object
CTR            float64
CR             float64
CV             float64
ROI            float64
EPV            float64
EPC            float64
dtype: object

### Data validation

In [6]:
# Checking if domain id's are valid

df[ (df['Domain ID'].str.isnumeric() == False)|( df['Domain ID'].isna() )|(df['Domain ID'].isnull()) ]

Unnamed: 0,Domain ID,Visits,Clicks,Conversions,Revenue,Cost,Profit,CTR,CR,CV,ROI,EPV,EPC
36,,722,1,0,0.0,112.05,-112.05,0.001385,,,,,
234,[did],105,28,5,479.0,17.1988,461.8012,0.266667,,,,,
2790,222034?utm_source=publisher,1,0,0,0.0,0.18,-0.18,0.0,,,,,


In [7]:
# Cleaning up

df.loc[36, 'Domain ID'] = '010101'
df.loc[2790, 'Domain ID'] = '222035'
df.loc[234, 'Domain ID'] = '222222'

new_vals = df.loc[36, 'Domain ID'], df.loc[2790, 'Domain ID'], df.loc[234, 'Domain ID']

# Checking that no duplicate keys were created

df[df['Domain ID'].isin(new_vals)] 

Unnamed: 0,Domain ID,Visits,Clicks,Conversions,Revenue,Cost,Profit,CTR,CR,CV,ROI,EPV,EPC
36,10101,722,1,0,0.0,112.05,-112.05,0.001385,,,,,
234,222222,105,28,5,479.0,17.1988,461.8012,0.266667,,,,,
2790,222035,1,0,0,0.0,0.18,-0.18,0.0,,,,,


In [8]:
# Filling missing values with 0

df.fillna(0, inplace=True)

# Creating new column for marking faulty columns

df['Faulty'] = ''

# Converting numeric id's to integer

for col in num_cols:
    df[col] = df[col].astype(int)
    
# Try to simply cast the types for floats

try: 
    df['Cost'].astype(float)
    df['Profit'] = df['Profit'].astype(float)
except:
    print("Can't convert Cost and Profit to float")

Can't convert Cost and Profit to float


## Fixing Cost and profit

In [9]:
df[~df['Cost'].str.match('((-|)\d+\.*\d*)')][:5]

Unnamed: 0,Domain ID,Visits,Clicks,Conversions,Revenue,Cost,Profit,CTR,CR,CV,ROI,EPV,EPC,Faulty
227,197814,108,6,0,0.0,дек.00,-12.96,0.055556,0.0,0.0,0.0,0.0,0.0,
232,165950,106,37,2,60.0,дек.00,47.28,0.349057,0.0,0.0,0.0,0.0,0.0,
263,207021,95,11,0,0.0,нояб.00,-11.4,0.115789,0.0,0.0,0.0,0.0,0.0,
266,233836,94,1,0,0.0,нояб.00,-11.28,0.010638,0.0,0.0,0.0,0.0,0.0,
295,223353,87,5,0,0.0,окт.00,-10.44,0.057471,0.0,0.0,0.0,0.0,0.0,


In [10]:
df.at[~df['Cost'].str.match('((-|)\d+\.*\d*)'), 'Faulty'] = 'Cost'
df.at[~df['Cost'].str.match('((-|)\d+\.*\d*)'), 'Cost'] = -1
df['Cost'] = df['Cost'].astype(float)

In [11]:
df[(df['Cost'] < 0) & (df['Cost'] != -1 )]

Unnamed: 0,Domain ID,Visits,Clicks,Conversions,Revenue,Cost,Profit,CTR,CR,CV,ROI,EPV,EPC,Faulty


In [12]:
df.dtypes

# Cost is converted, but values not fixed.

Domain ID        int64
Visits           int64
Clicks           int64
Conversions      int64
Revenue        float64
Cost           float64
Profit          object
CTR            float64
CR             float64
CV             float64
ROI            float64
EPV            float64
EPC            float64
Faulty          object
dtype: object

In [13]:
# Converting to float with stages in string

df['Profit'].astype(str)
df[~df['Profit'].str.match('((-|)\d+\.*\d*)')]
df.at[~df['Profit'].str.match('((-|)\d+\.*\d*)'), 'Faulty'] = 'Profit'
df.at[~df['Profit'].str.match('((-|)\d+\.*\d*)'), 'Profit'] = -1
df['Profit'] = df['Profit'].astype(float)

In [14]:
# Checking for overlaps

df[(df['Faulty'] == 'Profit') & (df['Faulty'] == 'Cost')].shape[0] == 0

True

In [15]:
df.dtypes

# Profit is converted, but values not fixed.

Domain ID        int64
Visits           int64
Clicks           int64
Conversions      int64
Revenue        float64
Cost           float64
Profit         float64
CTR            float64
CR             float64
CV             float64
ROI            float64
EPV            float64
EPC            float64
Faulty          object
dtype: object

In [16]:
for idx,row in df[df['Faulty'] == 'Cost'].iterrows():   
    row['Cost'] = row['Profit'] - row['Revenue']

In [17]:
for idx,row in df[df['Faulty'] == 'Profit'].iterrows():   
    row['Profit'] = row['Revenue'] - row['Cost']

In [18]:
df = df.groupby('Domain ID').sum()

In [19]:
df['Profit_abs'] = df['Profit'].apply(abs)

In [20]:
df.eval("""
    ....: Profit_minus_cost = Profit - Cost
    ....: CR = Conversions / Visits
    ....: CV = Cost / Visits
    ....: ROI = Profit / Cost
    ....: EPC = Profit / Clicks
    ....: EPV = Profit / Visits
    """, inplace=True)

In [21]:
# Not effective sources

df.describe()

Unnamed: 0,Visits,Clicks,Conversions,Revenue,Cost,Profit,CTR,CR,CV,ROI,EPV,EPC,Profit_abs,Profit_minus_cost
count,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0,3054.0
mean,62.151277,8.238048,0.331369,10.246234,7.728896,0.666293,0.211907,0.007531,0.086259,-0.173457,0.072763,-inf,10.362569,-7.062603
std,408.650572,56.650005,2.071491,63.773928,60.806674,41.886341,0.322609,0.060129,0.106418,15.317877,1.847472,,40.589305,82.580037
min,1.0,0.0,0.0,0.0,-1.0,-1060.24,0.0,0.0,-0.166667,-139.6941,-0.22,-inf,0.12,-2453.48
25%,2.0,0.0,0.0,0.0,-1.0,-3.21015,0.0,0.0,-0.028571,-1.0,-0.17,-inf,0.34,-3.32
50%,8.0,1.0,0.0,0.0,0.18,-0.89,0.071429,0.0,0.15,-1.0,-0.158571,-2.075,1.3562,-1.19
75%,28.0,4.0,0.0,0.0,0.808275,-0.2008,0.285714,0.0,0.165175,1.9518,-0.14375,-0.39,5.28,-0.36
max,11610.0,2562.0,54.0,1622.0,1653.27,762.38,5.0,2.0,0.22,332.333333,59.82,63.66,1060.24,444.6024


### Trying to fit probality distributions to data

In [35]:
import plotly.plotly as py
import plotly.figure_factory as ff

import numpy as np

x1 = df['Profit'].tolist()
  
hist_data = [x1]

group_labels = ['Profit']

colors = ['#3A4750', '#F64E8B']

# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, bin_size=.5, curve_type='normal', colors=colors)

# Add title
fig['layout'].update(title='Profit fit with Normal Distribution')
iplot(fig, filename='Profit fit with Normal Curve')


In [36]:
# Create distplot with curve_type set to 'normal'
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, colors=colors)

# Plot!
iplot(fig, filename='Distplot with Normal Curve')

In [24]:
from scipy.stats import norm, zscore

def sample_power_probtest(p1, p2, power=0.8, sig=0.05):
    z = norm.isf([sig/2]) #two-sided t test
    zp = -1 * norm.isf([power]) 
    d = (p1-p2)
    s =2*((p1+p2) /2)*(1-((p1+p2) /2))
    n = s * ((zp + z)**2) / (d**2)
    return int(round(n[0]))

def sample_power_difftest(d, s, power=0.8, sig=0.05):
    z = norm.isf([sig/2])
    zp = -1 * norm.isf([power])
    n = s * ((zp + z)**2) / (d**2)
    return int(round(n[0]))


n = sample_power_difftest(0.1, 0.5, power=0.8, sig=0.05)


In [25]:
# Create a trace
trace = go.Scatter(
    x = df['CTR'],
    y = df['CR'],
    mode = 'markers'
)

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

In [26]:
std = df.std(axis=0)
std

Visits               408.650572
Clicks                56.650005
Conversions            2.071491
Revenue               63.773928
Cost                  60.806674
Profit                41.886341
CTR                    0.322609
CR                     0.060129
CV                     0.106418
ROI                   15.317877
EPV                    1.847472
EPC                         NaN
Profit_abs            40.589305
Profit_minus_cost     82.580037
dtype: float64

In [27]:
mean = df.mean()
mean, mean['ROI']

(Visits               62.151277
 Clicks                8.238048
 Conversions           0.331369
 Revenue              10.246234
 Cost                  7.728896
 Profit                0.666293
 CTR                   0.211907
 CR                    0.007531
 CV                    0.086259
 ROI                  -0.173457
 EPV                   0.072763
 EPC                       -inf
 Profit_abs           10.362569
 Profit_minus_cost    -7.062603
 dtype: float64, -0.17345684165759229)

In [28]:
skew = df.skew()
skew

Visits               18.005858
Clicks               32.505365
Conversions          15.565545
Revenue              15.401142
Cost                 16.937008
Profit               -2.825435
CTR                   2.868474
CR                   19.274480
CV                   -0.724602
ROI                   5.774648
EPV                  18.871897
EPC                        NaN
Profit_abs           13.359662
Profit_minus_cost   -17.131640
dtype: float64

In [29]:
kurt = df.kurt()
kurt

Visits                399.503876
Clicks               1375.671324
Conversions           314.997855
Revenue               309.781557
Cost                  351.733651
Profit                231.675668
CTR                    19.436968
CR                    499.646984
CV                     -1.164019
ROI                   142.058351
EPV                   473.136382
EPC                          NaN
Profit_abs            248.881645
Profit_minus_cost     393.390348
dtype: float64

In [30]:
df[['CR', 'CV', 'ROI', 'EPV', 'EPC']].describe()

Unnamed: 0,CR,CV,ROI,EPV,EPC
count,3054.0,3054.0,3054.0,3054.0,3054.0
mean,0.007531,0.086259,-0.173457,0.072763,-inf
std,0.060129,0.106418,15.317877,1.847472,
min,0.0,-0.166667,-139.6941,-0.22,-inf
25%,0.0,-0.028571,-1.0,-0.17,-inf
50%,0.0,0.15,-1.0,-0.158571,-2.075
75%,0.0,0.165175,1.9518,-0.14375,-0.39
max,2.0,0.22,332.333333,59.82,63.66


In [31]:
from scipy.stats import norm
# mean['ROI'], variance['ROI'], skew['ROI'], kurt['ROI'] = norm.stats(moments='mvsk')

# my_norm = norm(loc=mean['ROI'], scale=variance['ROI'])

# x = np.linspace(my_norm.ppf(0.01), my_norm.ppf(0.99), 100)
# y = norm.ppf(x)

xmin = df['Profit'].min()
xmax = df['Profit'].max()

x = np.linspace(xmin, xmax, 1000)

mu, std = norm.fit(df['ROI'])
p = norm.pdf(x, mu, std)

trace1 = go.Scatter(
    x = x,
    y = p,
    mode = 'lines+markers',
    name = 'lines+markers'
)

data = [trace1]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

In [32]:
x

array([ -1.06024000e+03,  -1.05841556e+03,  -1.05659111e+03,
        -1.05476667e+03,  -1.05294222e+03,  -1.05111778e+03,
        -1.04929333e+03,  -1.04746889e+03,  -1.04564444e+03,
        -1.04382000e+03,  -1.04199556e+03,  -1.04017111e+03,
        -1.03834667e+03,  -1.03652222e+03,  -1.03469778e+03,
        -1.03287333e+03,  -1.03104889e+03,  -1.02922444e+03,
        -1.02740000e+03,  -1.02557556e+03,  -1.02375111e+03,
        -1.02192667e+03,  -1.02010222e+03,  -1.01827778e+03,
        -1.01645333e+03,  -1.01462889e+03,  -1.01280444e+03,
        -1.01098000e+03,  -1.00915556e+03,  -1.00733111e+03,
        -1.00550667e+03,  -1.00368222e+03,  -1.00185778e+03,
        -1.00003333e+03,  -9.98208889e+02,  -9.96384444e+02,
        -9.94560000e+02,  -9.92735556e+02,  -9.90911111e+02,
        -9.89086667e+02,  -9.87262222e+02,  -9.85437778e+02,
        -9.83613333e+02,  -9.81788889e+02,  -9.79964444e+02,
        -9.78140000e+02,  -9.76315556e+02,  -9.74491111e+02,
        -9.72666667e+02,

In [33]:
p = norm.cdf(x, mu, std)

trace1 = go.Scatter(
    x = x,
    y = p,
    mode = 'lines+markers',
    name = 'lines+markers'
)

data = [trace1]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

In [34]:
my_norm = norm(loc=mean['ROI'], scale=variance['ROI'])

import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

NameError: name 'variance' is not defined

In [None]:
def is_string(i):
    return isinstance(i, str)



df['Cost type'] = df['Cost'].apply(is_string)
df['Profit type'] = df['Profit'].apply(is_string)

In [None]:
df[['Cost type', 'Cost']].ix[0, 'Cost']

In [None]:
df[df['Profit type'] != True]

In [None]:
df.select_dtypes(include=['object'])

In [None]:
df["Cost"].ast
pandas.api.df['Cost'].str.search("{")

In [None]:
df['Cost'].str.find("{")

In [None]:
df.applymap(lambda x: isinstance(x, (int, float)))

In [None]:
non_num = {}

comp_cols = []

for col in all_cols:
    str_col = str(col)
    new_key = str_col + "_comp"
    df[new_key] = df.applymap(lambda x: isinstance(x, (int, float)))
    comp_cols.append(new_key)    

In [None]:
for col in all_cols:
    df[col] = df[col].astype(str)    

In [None]:
df[ df['is_numeric'] > 0 ] 

In [None]:
for key, item in non_num.items():
    print (key, item.shape)

In [None]:
for df in non_num:
    print (df.shape)

In [None]:
non_num['Domain ID']

# Cleaning up domain id

df.loc[36, 'Domain ID'] = '0'
df.loc[234, 'Domain ID'] = '123123'
df.loc[2790, 'Domain ID'] = '222035'
df.loc[234, 'Domain ID'], df.loc[2790, 'Domain ID'], df.loc[36, 'Domain ID'] 

In [None]:
for column in all_cols:
   df[column] = pd.to_numeric(df[column], downcast='signed')

In [None]:
for i in range(len(nums_cols)):
    col = nums_cols[i]
    print(col)
    df[col + "is numeric"] = df[col].str.isnumeric

In [None]:

import re

num_ch = re.compile('[\d\s\.]*')
cols = df.columns 

In [None]:
nums_cols = [
    "Domain ID",
    "ID",
    "Visits",
    "Clicks",
    "Conversions",
    "Revenues"
]

import re


        

In [None]:
for name, col in df.transpose().iterrows():
    str_col  = str(col).strip("")
    if not num_ch.search(str_col):
        broken.apppend(({name: str_col})
                       col

In [None]:
df.fillna(0, inplace=True)

In [None]:
nums_cols = [
    "Domain ID",
    "ID",
    "Visits",
    "Clicks",
    "Conversions",
    "Revenues"
]

for item in nums_cols:
    df[item].astype(int, inplace=True)