In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from pandas.plotting import scatter_matrix
%matplotlib inline

In [4]:
df = pd.read_csv('balance.txt')
df.head()

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [5]:
df.columns

Index(['Unnamed: 0', 'Income', 'Limit', 'Rating', 'Cards', 'Age', 'Education',
       'Gender', 'Student', 'Married', 'Ethnicity', 'Balance'],
      dtype='object')

In [7]:
df.corr()

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
Unnamed: 0,1.0,0.037203,0.024172,0.021985,-0.036304,0.058603,-0.001415,0.006064
Income,0.037203,1.0,0.792088,0.791378,-0.018273,0.175338,-0.027692,0.463656
Limit,0.024172,0.792088,1.0,0.99688,0.010231,0.100888,-0.023549,0.861697
Rating,0.021985,0.791378,0.99688,1.0,0.053239,0.103165,-0.030136,0.863625
Cards,-0.036304,-0.018273,0.010231,0.053239,1.0,0.042948,-0.051084,0.086456
Age,0.058603,0.175338,0.100888,0.103165,0.042948,1.0,0.003619,0.001835
Education,-0.001415,-0.027692,-0.023549,-0.030136,-0.051084,0.003619,1.0,-0.008062
Balance,0.006064,0.463656,0.861697,0.863625,0.086456,0.001835,-0.008062,1.0


In [8]:
y = df.Balance
y.head()

0    333
1    903
2    580
3    964
4    331
Name: Balance, dtype: int64

In [9]:
x = df.drop(columns=['Balance'])
x.head()

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian


In [10]:
x = pd.get_dummies(x)
x.head()

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender_ Male,Gender_Female,Student_No,Student_Yes,Married_No,Married_Yes,Ethnicity_African American,Ethnicity_Asian,Ethnicity_Caucasian
0,1,14.891,3606,283,2,34,11,1,0,1,0,0,1,0,0,1
1,2,106.025,6645,483,3,82,15,0,1,0,1,0,1,0,1,0
2,3,104.593,7075,514,4,71,11,1,0,1,0,1,0,0,1,0
3,4,148.924,9504,681,3,36,11,0,1,1,0,1,0,0,1,0
4,5,55.882,4897,357,2,68,16,1,0,1,0,0,1,0,0,1


In [11]:
x = x.drop(columns=['Gender_Female','Student_No', 'Married_No', 'Ethnicity_Asian'])
x.head()

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender_ Male,Student_Yes,Married_Yes,Ethnicity_African American,Ethnicity_Caucasian
0,1,14.891,3606,283,2,34,11,1,0,1,0,1
1,2,106.025,6645,483,3,82,15,0,1,1,0,0
2,3,104.593,7075,514,4,71,11,1,0,0,0,0
3,4,148.924,9504,681,3,36,11,0,0,0,0,0
4,5,55.882,4897,357,2,68,16,1,0,1,0,1


In [12]:
x = sm.add_constant(x)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [14]:
x_train.shape, x_test.shape

((268, 13), (132, 13))

In [15]:
model = sm.OLS(y_train, x_train)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.955
Model:,OLS,Adj. R-squared:,0.953
Method:,Least Squares,F-statistic:,450.6
Date:,"Wed, 18 Jul 2018",Prob (F-statistic):,5.3700000000000005e-164
Time:,15:56:15,Log-Likelihood:,-1615.2
No. Observations:,268,AIC:,3256.0
Df Residuals:,255,BIC:,3303.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-477.4417,46.481,-10.272,0.000,-568.978,-385.906
Unnamed: 0,0.0195,0.058,0.338,0.736,-0.094,0.133
Income,-7.5929,0.310,-24.465,0.000,-8.204,-6.982
Limit,0.2216,0.041,5.454,0.000,0.142,0.302
Rating,0.6277,0.610,1.029,0.304,-0.573,1.829
Cards,21.3910,5.356,3.994,0.000,10.844,31.938
Age,-0.5304,0.385,-1.378,0.169,-1.288,0.227
Education,-0.5830,2.067,-0.282,0.778,-4.653,3.487
Gender_ Male,22.7738,12.847,1.773,0.077,-2.525,48.073

0,1,2,3
Omnibus:,18.586,Durbin-Watson:,2.018
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.032
Skew:,0.686,Prob(JB):,2.71e-05
Kurtosis:,3.006,Cond. No.,39900.0
