* **This notebook gives a short, hands-on introduction to principal component analysis (PCA) in Python. PCA projects a high-dimensional dataset onto a few components, which makes the data possible to visualize and cheaper to process.**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sys
# Silence every warning for the rest of the session, unless warning filters
# were supplied explicitly when the interpreter was started (e.g. via -W).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Plotly draws the interactive 3D scatter plot at the end of the notebook.
import plotly.offline as py
# Enable Plotly's offline notebook rendering; with connected=True the plotly.js
# bundle is loaded from a CDN instead of being embedded in the notebook.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

In [2]:
# Load the semicolon-delimited campaign data from the current working directory.
dataset = pd.read_csv('marketing_campaign.csv', sep=';')

In [3]:
# Preview the first five rows to check that the file was parsed into columns.
dataset.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [4]:
# List column dtypes and non-null counts to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [5]:
# Discard every row that contains a missing value
# (e.g. Income is null in 24 of the 2240 rows).
dataset = dataset.dropna()

### Convert categorical data to numerical data for processing

In [6]:
# Parse the registration date strings into pandas timestamps.
dataset["Dt_Customer"] = pd.to_datetime(dataset["Dt_Customer"])

# Keep only the calendar date of each registration timestamp.
dates = [timestamp.date() for timestamp in dataset["Dt_Customer"]]

print("Registration date of the newest customer on record:", max(dates))
print("Registration date of the oldest customer on record:", min(dates))

Registration date of the newest customer on record: 2014-06-29
Registration date of the oldest customer on record: 2012-07-30


In [7]:
# Customer tenure: how long each customer has been registered, measured in
# whole days relative to the most recently registered customer in the data.
#
# The differences are stored through timedelta.days so the column holds a
# plain integer number of days. Converting a timedelta column with
# pd.to_numeric would instead yield nanosecond counts, and errors="coerce"
# would silently turn any conversion problem into NaN.
latest_registration = max(dates)
days = [(latest_registration - registered).days for registered in dates]
dataset["Customer_For"] = days

In [8]:
# Collapse the marital-status labels into two household situations.
# Labels not listed here are left unchanged by replace().
partner_labels = ["Married", "Together"]
alone_labels = ["Absurd", "Widow", "YOLO", "Divorced", "Single"]
living_with_map = {
    **dict.fromkeys(partner_labels, "Partner"),
    **dict.fromkeys(alone_labels, "Alone"),
}
dataset["Living_With"] = dataset["Marital_Status"].replace(living_with_map)

In [9]:
# Collapse the five education labels into two levels.
undergraduate_labels = ["Basic", "2n Cycle"]
graduate_labels = ["Graduation", "Master", "PhD"]
education_map = {
    **dict.fromkeys(undergraduate_labels, "Undergraduate"),
    **dict.fromkeys(graduate_labels, "Graduate"),
}
dataset["Education"] = dataset["Education"].replace(education_map)

In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace the remaining text categories with integer codes.
# The encoder is refitted for each column, so afterwards it only remembers
# the classes of the last column it processed.
le = LabelEncoder()
for column in ("Education", "Living_With"):
    dataset[column] = le.fit_transform(dataset[column])

In [12]:
# Remove the identifier and the raw columns that were replaced by the
# derived features (Living_With, Customer_For).
to_drop = ["Marital_Status", "Dt_Customer", "ID"]
dataset = dataset.drop(columns=to_drop)

### We are ready!

In [13]:
# Confirm that every remaining column is numeric before scaling.
dataset.dtypes

Year_Birth               int64
Education                int64
Income                 float64
Kidhome                  int64
Teenhome                 int64
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Z_CostContact            int64
Z_Revenue                int64
Response                 int64
Customer_For             int64
Living_With              int64
dtype: object

In [14]:
# Number of rows and feature columns that go into the scaler.
dataset.shape

(2216, 28)

### Features are standardized (zero mean, unit variance) so that columns with large numeric values do not dominate the principal components.

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize every column: learn the per-column statistics first,
# then apply them to the same data.
sc = StandardScaler()
sc.fit(dataset)
scaled_data = sc.transform(dataset)


### The data is reduced to 3 dimensions.

In [16]:
from sklearn.decomposition import PCA

# Project the standardized features onto the first three principal components.
# random_state is pinned because, depending on the scikit-learn version and the
# shape of the data, the default svd_solver="auto" may select the randomized
# solver, whose output would otherwise vary slightly from run to run.
pca = PCA(n_components=3, random_state=42)
pca.fit(scaled_data)
pca_data = pd.DataFrame(pca.transform(scaled_data), columns=["col1", "col2", "col3"])

# Summary statistics, one row per component.
pca_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
col1,2216.0,2.276558e-16,2.562114,-5.149307,-2.312023,-0.667787,2.065974,7.672163
col2,2216.0,-5.1803730000000005e-17,1.45874,-3.542499,-1.13293,-0.23439,0.99871,5.045039
col3,2216.0,-3.9078250000000005e-17,1.385927,-2.943645,-0.786237,-0.151844,0.388307,8.424851


In [22]:
# Preview the first five rows of the three-component projection.
pca_data.head()

Unnamed: 0,col1,col2,col3
0,3.861984,0.50633,0.119815
1,-2.398971,-0.200292,-0.342626
2,1.612362,-0.100465,-1.123985
3,-2.555988,-1.451764,0.15183
4,-0.492125,-0.304219,-0.699469


### The data reduced to 3 dimensions is visualized.

In [18]:
# Axis titles for the three principal components.
Scene = dict(
    xaxis=dict(title='Col1'),
    yaxis=dict(title='Col2'),
    zaxis=dict(title='Col3'),
)

# One blue marker with a black outline for every row of pca_data.
marker_style = dict(color='blue', size=10, line=dict(color='black', width=10))
trace = go.Scatter3d(
    x=pca_data['col1'],
    y=pca_data['col2'],
    z=pca_data['col3'],
    mode='markers',
    marker=marker_style,
)

# Square 800x800 canvas without left/right margins.
layout = go.Layout(margin=dict(l=0, r=0), scene=Scene, height=800, width=800)
data = [trace]
fig = go.Figure(data=data, layout=layout)
fig.show()

**The visualization is also included below as a static image because interactive Plotly figures do not display correctly on GitHub.**
![](https://github.com/emreyesilyurt/dimension_reduction/blob/master/assets/3d.png?raw=true)