# El tratamiento de las variables categóricas

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Load the e-commerce expense dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
# Note: some CSV headers carry stray whitespace ('Age ' and ' Items '), so later
# cells must reference those columns with the exact padded names.
df = pd.read_csv("../datasets/ecom-expense/Ecom Expense.csv")

In [3]:
# Preview the first rows to sanity-check the load and see the available columns.
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [4]:
# One-hot encode the two categorical columns. Each (source column, prefix) pair
# yields a frame of indicator columns named "<prefix>_<category>".
dummy_gender, dummy_city_tier = (
    pd.get_dummies(df[source_col], prefix=label)
    for source_col, label in (("Gender", "Gender"), ("City Tier", "City"))
)

In [5]:
# Preview the gender indicator columns (one column per category).
dummy_gender.head()

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [6]:
# Preview the city-tier indicator columns (one column per tier).
dummy_city_tier.head()

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0


In [7]:
# Snapshot the original column labels as a plain Python list and display them;
# the output makes the whitespace-padded headers visible.
column_names = df.columns.tolist()
column_names

['Transaction ID',
 'Age ',
 ' Items ',
 'Monthly Income',
 'Transaction Time',
 'Record',
 'Gender',
 'City Tier',
 'Total Spend']

In [8]:
# Attach the gender indicator columns to the original data (index-aligned join).
# The join is done on `df` itself rather than on `df[column_names]`: this cell
# overwrites `column_names` below with a list that includes the dummy columns,
# so selecting `df[column_names]` would raise a KeyError the second time the
# cell is executed. Joining on `df` gives the same frame and is re-runnable.
df_new = df.join(dummy_gender)

# Remember the columns present so far (original columns + gender dummies);
# the next cell uses this list when adding the city-tier dummies.
column_names = df_new.columns.values.tolist()
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0


In [9]:
# Add the city-tier indicator columns. Restricting to `column_names` first
# (original columns + gender dummies) means a re-run starts from the same
# columns instead of colliding with city dummies that were already joined.
df_new = df_new.loc[:, column_names].join(other=dummy_city_tier)
df_new.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend,Gender_Female,Gender_Male,City_Tier 1,City_Tier 2,City_Tier 3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [49]:
# Predictor columns for the regression; fitted coefficients follow this order.
# The padded names "Age " and " Items " are intentional: they match the CSV headers.
# NOTE(review): every level of both categorical variables is listed. With a model
# intercept, the dummy columns of each group sum to 1 and are therefore collinear
# with it, so individual dummy coefficients are not uniquely determined. Consider
# dropping one level per variable; confirm whether a later section already does.
feature_cols = [
    "Monthly Income", "Transaction Time",
    "Gender_Female", "Gender_Male",
    "City_Tier 1", "City_Tier 2", "City_Tier 3",
    "Record", "Age ", " Items ",
]

In [50]:
# Design matrix (predictors) and target vector for the regression.
X = df_new.loc[:, feature_cols]
Y = df_new.loc[:, "Total Spend"]

In [51]:
# Create and fit the linear model in one step; `fit` returns the estimator
# itself, so `lm` is the fitted model. The bare `lm` keeps the repr as output.
lm = LinearRegression().fit(X, Y)
lm

LinearRegression()

In [52]:
# Fitted intercept (constant term) of the linear model.
lm.intercept_

-643.7731838035506

In [53]:
# Fitted coefficients, one per predictor, in the same order as `feature_cols`.
lm.coef_

array([ 1.47731711e-01,  1.69048699e-01, -1.31627781e+02,  1.31627781e+02,
        6.98809779e+01,  5.04144467e+01, -1.20295425e+02,  7.71442188e+02,
        6.18968657e+00,  3.93442110e+01])

In [54]:
# Pair each predictor name with its fitted coefficient for easier reading.
[(feature, weight) for feature, weight in zip(feature_cols, lm.coef_)]

[('Monthly Income', 0.14773171095343043),
 ('Transaction Time', 0.16904869866853112),
 ('Gender_Female', -131.62778138807408),
 ('Gender_Male', 131.62778138807408),
 ('City_Tier 1', 69.88097793487164),
 ('City_Tier 2', 50.41444665204421),
 ('City_Tier 3', -120.29542458691608),
 ('Record', 771.4421878591879),
 ('Age ', 6.189686573746982),
 (' Items ', 39.34421096996615)]

In [55]:
# R^2 of the model, evaluated on the same data it was fitted on (no hold-out
# set is used here), so this is an in-sample figure.
lm.score(X,Y)

0.922356824990408

El modelo puede ser escrito como:
    Total_Spend = lm.intercept_ + Σ (lm.coef_[i] * Variable_i), para cada una de las n variables de feature_cols