### Lineal Regresion : Con variables categoricas

In [45]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv('Ecom Expense.csv')

In [7]:
df.head()

Unnamed: 0,Transaction ID,Age,Items,Monthly Income,Transaction Time,Record,Gender,City Tier,Total Spend
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485


In [11]:
dummy_gender = pd.get_dummies(df['Gender'], prefix="Gender")
dummy_city = pd.get_dummies(df['City Tier'], prefix="City")

In [12]:
dummy_gender

Unnamed: 0,Gender_Female,Gender_Male
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
5,0,1
6,0,1
7,0,1
8,0,1
9,1,0


In [13]:
dummy_city

Unnamed: 0,City_Tier 1,City_Tier 2,City_Tier 3
0,1,0,0
1,0,1,0
2,0,1,0
3,1,0,0
4,0,1,0
5,0,1,0
6,1,0,0
7,0,0,1
8,1,0,0
9,0,1,0


In [17]:
columns_names = df.columns.values

In [18]:
columns_names

array(['Transaction ID', 'Age ', ' Items ', 'Monthly Income',
       'Transaction Time', 'Record', 'Gender', 'City Tier', 'Total Spend'],
      dtype=object)

In [21]:
df_new = df[columns_names].join(dummy_gender).join(dummy_city)


In [23]:
df_new.columns.values

array(['Transaction ID', 'Age ', ' Items ', 'Monthly Income',
       'Transaction Time', 'Record', 'Gender', 'City Tier', 'Total Spend',
       'Gender_Female', 'Gender_Male', 'City_Tier 1', 'City_Tier 2',
       'City_Tier 3'], dtype=object)

In [26]:
names= ['trans_id', 'age', 'items', 'monthly_income',
       'trans_time', 'record', 'gender', 'city_tier', 'total_spend',
       'gender_female', 'gender_male', 'city_tier_1', 'city_tier_2',
       'city_tier_3']

In [32]:
names

['trans_id',
 'age',
 'items',
 'monthly_income',
 'trans_time',
 'record',
 'gender',
 'city_tier',
 'total_spend',
 'gender_female',
 'gender_male',
 'city_tier_1',
 'city_tier_2',
 'city_tier_3']

In [34]:
df_new.rename_axis?

[0;31mSignature:[0m
[0mdf_new[0m[0;34m.[0m[0mrename_axis[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmapper[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minplace[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Set the name of the axis for the index or columns.

Parameters
----------
mapper : scalar, list-like, optional
    Value to set the axis name attribute.
index, columns : scalar, list-like, dict-like or function, optional
    A scalar, list-like, dict-like or functions transformations to
    apply to that axis' values.

    Use either ``mapper`` and ``axis``

In [37]:
df_new.columns = names

In [39]:
df_new.head()

Unnamed: 0,trans_id,age,items,monthly_income,trans_time,record,gender,city_tier,total_spend,gender_female,gender_male,city_tier_1,city_tier_2,city_tier_3
0,TXN001,42,10,7313,627.668127,5,Female,Tier 1,4198.385084,1,0,1,0,0
1,TXN002,24,8,17747,126.904567,3,Female,Tier 2,4134.976648,1,0,0,1,0
2,TXN003,47,11,22845,873.469701,2,Male,Tier 2,5166.614455,0,1,0,1,0
3,TXN004,50,11,18552,380.219428,7,Female,Tier 1,7784.447676,1,0,1,0,0
4,TXN005,60,2,14439,403.374223,2,Female,Tier 2,3254.160485,1,0,0,1,0


In [74]:
feature_cols = ['monthly_income','trans_time', 
                'city_tier_1','city_tier_2',
                'city_tier_3','gender_female',
                'gender_male', 'record', 'age']

In [75]:
data_training, data_testing = train_test_split(df_new, test_size=0.2)

x = data_training[feature_cols]
y = data_training['total_spend']

x_test = data_testing[feature_cols]
y_test = data_testing['total_spend']

In [76]:
estimator = SVR(kernel ="linear")
selector = RFE(estimator, 6, step=1)
selector = selector.fit(x,y)

In [77]:
selector.ranking_

array([4, 3, 1, 2, 1, 1, 1, 1, 1])

In [78]:
selector.support_

array([False, False,  True, False,  True,  True,  True,  True,  True])

In [79]:
lm = LinearRegression()

In [80]:
lm.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [81]:
lm.score(x,y)

0.9203660427025878

In [82]:
lm.coef_

array([ 1.46932562e-01,  1.60586414e-01,  7.08152880e+01,  4.97981593e+01,
       -1.20613447e+02, -1.25367309e+02,  1.25367309e+02,  7.77433487e+02,
        7.44006562e+00])

In [83]:
list(zip(feature_cols,lm.coef_))

[('monthly_income', 0.14693256163530682),
 ('trans_time', 0.1605864142709308),
 ('city_tier_1', 70.81528802784827),
 ('city_tier_2', 49.798159315432336),
 ('city_tier_3', -120.61344734328048),
 ('gender_female', -125.36730921360721),
 ('gender_male', 125.36730921360726),
 ('record', 777.4334869785355),
 ('age', 7.44006562296276)]

In [84]:
y_predict = lm.predict(x_test)


In [86]:
y_predict

array([ 2807.7311142 ,  7591.13480973,  5143.66057849,  3528.55524561,
        9455.13981052,  7027.00758952, 10061.59078591,  8934.81333249,
        6524.802319  ,  7395.34003912,   320.5500255 ,  5970.42232205,
        5482.0947821 ,  3939.13982208,   738.68054041,  4143.23031716,
        2402.9746185 ,  5921.56324986,   520.40164116, 10795.6262443 ,
        2103.04637943,  3532.48346872, 10854.90803407,  3545.57151359,
        7909.42360879,  8055.41559624,  6290.20541302,  6714.71515479,
        6823.61598408,  6235.48813443,  4377.73976786,  3708.82555857,
        4467.51655104,  3933.26019162, 10551.56275164,  1261.76713609,
        2797.15355425,  8613.54517034,  6802.70344774,  5978.42033064,
        5869.70217304,  2975.94080514, 10296.35123614,  4207.763769  ,
        2239.2685616 ,  6196.32417629,  8869.81432614,  4672.83573731,
       10582.77892998,  9091.02024284,  3628.86534183,  6921.09914786,
        2861.93894283,  4929.99181656,  2392.09135889,  1042.88984549,
      

In [87]:
y_test

2235     3681.191036
110      7577.553893
2128     4199.645233
1418     3762.170217
1326     9152.162648
202      6520.705718
1396    10014.223480
1463    10607.928370
2280     5510.380599
128      7047.271755
2056     1668.656531
1754     4898.655289
1361     4852.033309
818      4074.507023
986      2060.517109
438      4917.364558
2124     2964.479679
1523     4940.052481
1867     2096.056522
1508    12222.917370
237      2536.860070
1563     2904.691439
561     11438.186360
1479     3260.127845
2321     7766.398004
752      7722.235717
1134     5543.379308
1816     5892.138615
689      6307.362017
256      5616.825494
            ...     
2030     2744.804564
2275     6365.174538
113      9423.882179
2008     8272.009559
1213     9403.728361
1222     9405.614360
138      3922.537837
1034     8272.388638
578      5273.844200
1043     4497.244784
1491     9654.515641
1230     4884.147231
1721     5673.601917
725     11831.355300
1106     3293.522467
1172     3583.357677
114      4914

In [93]:
ssd = np.sum((y_predict-y_test)**2)

In [94]:
ssd

310662700.8145219

In [95]:
rse = np.sqrt(ssd/len(x_test)-len(feature_cols)-1)

In [96]:
rse


810.4209889383947

In [97]:
total_spend_mean = np.mean(y_test)

In [99]:
total_spend_mean

6077.5770132262105

In [100]:
error = rse/total_spend_mean

In [101]:
error

0.13334606656151482

In [102]:
len(x_test)

473

In [104]:
df_new.corr()

Unnamed: 0,age,items,monthly_income,trans_time,record,total_spend,gender_female,gender_male,city_tier_1,city_tier_2,city_tier_3
age,1.0,0.016626,0.010164,-0.007045,0.003397,0.03361,0.026996,-0.026996,-0.015177,0.028467,-0.013209
items,0.016626,1.0,-0.013364,-0.020551,0.012683,0.067042,-0.006959,0.006959,0.035742,0.006314,-0.042659
monthly_income,0.010164,-0.013364,1.0,-0.044855,0.018491,0.438623,-0.005637,0.005637,-0.015803,-0.004128,0.020205
trans_time,-0.007045,-0.020551,-0.044855,1.0,-0.005044,-0.00561,-0.038704,0.038704,0.006533,-0.005512,-0.001094
record,0.003397,0.012683,0.018491,-0.005044,1.0,0.857675,0.015554,-0.015554,0.012696,-0.021404,0.008628
total_spend,0.03361,0.067042,0.438623,-0.00561,0.857675,1.0,-0.036481,0.036481,0.023415,-0.00664,-0.017109
gender_female,0.026996,-0.006959,-0.005637,-0.038704,0.015554,-0.036481,1.0,-1.0,0.003662,-0.001277,-0.002436
gender_male,-0.026996,0.006959,0.005637,0.038704,-0.015554,0.036481,-1.0,1.0,-0.003662,0.001277,0.002436
city_tier_1,-0.015177,0.035742,-0.015803,0.006533,0.012696,0.023415,0.003662,-0.003662,1.0,-0.510633,-0.502356
city_tier_2,0.028467,0.006314,-0.004128,-0.005512,-0.021404,-0.00664,-0.001277,0.001277,-0.510633,1.0,-0.486915
