## Title: Data Review
## Author: Dennis
## Date: Jan 2022

***

## Import Libraries and Configure the Notebook

In [1]:
import numpy as np
from numpy import count_nonzero
from numpy import median
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import random

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols

import datetime
from datetime import datetime, timedelta

import scipy.stats
from collections import Counter

%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
sns.set_style('dark')
sns.set(font_scale=1.2)

plt.rc('axes', titlesize=9)
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format','{:.2f}'.format)

random.seed(0)
np.random.seed(0)
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


## Exploratory Data Analysis

In [2]:
# Load the marketing dataset, converting Dt_Customer to datetime on read.
# NOTE(review): no explicit date format is passed, so pandas infers it —
# confirm the day/month order of Dt_Customer in the CSV.
df = pd.read_csv(
    "marketing_data.csv",
    parse_dates=["Dt_Customer"],
)

In [3]:
# Display the loaded frame (the rendered output is truncated to the first and last rows).
df

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country
0,1826,1970,Graduation,Divorced,84835.00,0,0,2014-06-16,0,189,104,379,111,189,218,1,4,4,6,1,0,0,0,0,0,1,0,Spain
1,1,1961,Graduation,Single,57091.00,0,0,2014-06-15,0,464,5,64,7,0,37,1,7,3,7,5,0,0,0,0,1,1,0,Canada
2,10476,1958,Graduation,Married,67267.00,0,1,2014-05-13,0,134,11,59,15,2,30,1,3,2,5,2,0,0,0,0,0,0,0,USA
3,1386,1967,Graduation,Together,32474.00,1,1,2014-11-05,0,10,0,1,0,0,0,1,1,0,2,7,0,0,0,0,0,0,0,Australia
4,5371,1989,Graduation,Single,21474.00,1,0,2014-08-04,0,6,16,24,11,0,34,2,3,1,2,7,1,0,0,0,0,1,0,Spain
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142,1976,PhD,Divorced,66476.00,0,1,2013-07-03,99,372,18,126,47,48,78,2,5,2,11,4,0,0,0,0,0,0,0,USA
2236,5263,1977,2n Cycle,Married,31056.00,1,0,2013-01-22,99,5,10,13,3,8,16,1,1,0,3,8,0,0,0,0,0,0,0,Spain
2237,22,1976,Graduation,Divorced,46310.00,1,0,2012-03-12,99,185,2,88,15,5,14,2,6,1,5,8,0,0,0,0,0,0,0,Spain
2238,528,1978,Graduation,Married,65819.00,0,0,2012-11-29,99,267,38,701,149,165,63,1,5,4,10,3,0,0,0,0,0,0,0,India


In [4]:
# Column dtypes and non-null counts, used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   2240 non-null   int64         
 1   Year_Birth           2240 non-null   int64         
 2   Education            2240 non-null   object        
 3   Marital_Status       2240 non-null   object        
 4   Income               2216 non-null   float64       
 5   Kidhome              2240 non-null   int64         
 6   Teenhome             2240 non-null   int64         
 7   Dt_Customer          2240 non-null   datetime64[ns]
 8   Recency              2240 non-null   int64         
 9   MntWines             2240 non-null   int64         
 10  MntFruits            2240 non-null   int64         
 11  MntMeatProducts      2240 non-null   int64         
 12  MntFishProducts      2240 non-null   int64         
 13  MntSweetProducts     2240 non-nul

In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Response,Complain,Country
count,2240.0,2240.0,2240,2240,2216.0,2240.0,2240.0,2240,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240
unique,,,5,8,,,,663,,,,,,,,,,,,,,,,,,,,8
top,,,Graduation,Married,,,,2012-08-31 00:00:00,,,,,,,,,,,,,,,,,,,,Spain
freq,,,1127,864,,,,12,,,,,,,,,,,,,,,,,,,,1095
first,,,,,,,,2012-01-08 00:00:00,,,,,,,,,,,,,,,,,,,,
last,,,,,,,,2014-12-06 00:00:00,,,,,,,,,,,,,,,,,,,,
mean,5592.16,1968.81,,,52247.25,0.44,0.51,,49.11,303.94,26.3,166.95,37.53,27.06,44.02,2.33,4.08,2.66,5.79,5.32,0.07,0.07,0.07,0.06,0.01,0.15,0.01,
std,3246.66,11.98,,,25173.08,0.54,0.54,,28.96,336.6,39.77,225.72,54.63,41.28,52.17,1.93,2.78,2.92,3.25,2.43,0.26,0.26,0.26,0.25,0.11,0.36,0.1,
min,0.0,1893.0,,,1730.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25%,2828.25,1959.0,,,35303.0,0.0,0.0,,24.0,23.75,1.0,16.0,3.0,1.0,9.0,1.0,2.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


## Part 1

In [6]:
# Total web purchases per education level, kept as a one-column frame.
barplot = (
    df.groupby("Education")["NumWebPurchases"]
    .sum()
    .to_frame()
)
barplot

Unnamed: 0_level_0,NumWebPurchases
Education,Unnamed: 1_level_1
2n Cycle,757
Basic,102
Graduation,4649
Master,1492
PhD,2150


The first figure will be a bar plot comparing the total number of web purchases across customers' education levels.

## Part 2

In [7]:
# Total web purchases summed per Dt_Customer date, kept as a one-column frame.
lineplot = (
    df.groupby("Dt_Customer")["NumWebPurchases"]
    .sum()
    .to_frame()
)
lineplot

Unnamed: 0_level_0,NumWebPurchases
Dt_Customer,Unnamed: 1_level_1
2012-01-08,17
2012-01-09,8
2012-01-10,22
2012-01-11,6
2012-01-12,19
...,...
2014-12-02,13
2014-12-03,21
2014-12-04,2
2014-12-05,41


The second figure will be a line plot with the `Dt_Customer` date on the x-axis and the total number of web purchases on the y-axis, to look for a trend over time.

## Part 3

In [8]:
# Keep only the two columns compared in the scatter plot.
scatter_columns = ["Income", "NumWebPurchases"]
scatterplot = df.loc[:, scatter_columns]
scatterplot

Unnamed: 0,Income,NumWebPurchases
0,84835.00,4
1,57091.00,7
2,67267.00,3
3,32474.00,1
4,21474.00,3
...,...,...
2235,66476.00,5
2236,31056.00,1
2237,46310.00,6
2238,65819.00,5


The third figure will be a scatter plot examining the relationship between income and the number of web purchases.