In [1]:
import pandas as pd
import seaborn as sns
import matplotlib

# Load the Fitbit export; every later Fitbit cell reads from this DataFrame.
df = pd.read_csv('./data/Fitbit2.csv')

# Scatter plot (with seaborn's fitted regression line) of two specific columns:
# 'MinutesOfBeingAwake' on the x-axis against 'NumberOfAwakings' on the y-axis.
sns.regplot(data=df, x="MinutesOfBeingAwake", y="NumberOfAwakings")

<matplotlib.axes._subplots.AxesSubplot at 0x11a6a0320>

In [2]:
from sklearn import linear_model

# How to establish a linear equation between two variables, for predictions:
#   NumberOfAwakings ~ intercept + coef * MinutesOfBeingAwake
lm = linear_model.LinearRegression()

# Double-bracket selection keeps X and Y two-dimensional (one column each) and
# raises KeyError on a misspelled column instead of silently creating NaNs.
X = df[['MinutesOfBeingAwake']]
Y = df[['NumberOfAwakings']]

model = lm.fit(X, Y)

# A notebook cell only echoes its LAST expression, so the fitted parameters
# must be printed explicitly to be visible.
print("intercept:", lm.intercept_)
print("coefficient:", lm.coef_)

# In-sample predictions (same rows the model was fitted on), used by the
# metrics cell that follows.
predictions = lm.predict(X)

In [3]:
from sklearn.metrics import mean_squared_error, r2_score

# How to compute the mean squared error (MSE) and R-squared.
# Note: the value printed first is the MSE itself; RMSE would be its square root.
print(mean_squared_error(y_true=Y, y_pred=predictions))
r2_score(y_true=Y, y_pred=predictions)

10.853805417037568


0.9059552749155962

In [4]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Using the statsmodels formula API (ols) to fit the same simple regression and
# print its summary() table. (summary() comes from statsmodels, not scikit-learn.)
# This rebinds the notebook-level name `model` to the statsmodels result.
model = ols(formula="NumberOfAwakings~MinutesOfBeingAwake", data=df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       NumberOfAwakings   R-squared:                       0.906
Model:                            OLS   Adj. R-squared:                  0.906
Method:                 Least Squares   F-statistic:                     3516.
Date:                Fri, 12 Jul 2019   Prob (F-statistic):          1.89e-189
Time:                        09:54:45   Log-Likelihood:                -958.31
No. Observations:                 367   AIC:                             1921.
Df Residuals:                     365   BIC:                             1928.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               0.7290    

In [5]:
# How to create a model for Multiple Linear Regression (multivariate analysis):
# NumberOfAwakings is regressed on three predictor columns at once.
Y = df[['NumberOfAwakings']]  # same selection style as X; KeyError on a typo
X = df[['MinutesOfBeingAwake', 'MinutesOfSleep', 'Activity Calories']]

lm = linear_model.LinearRegression()
model = lm.fit(X, Y)

# A notebook cell only echoes its LAST expression, so the fitted parameters
# must be printed explicitly to be visible.
print("intercept:", lm.intercept_)
print("coefficients:", lm.coef_)

# In-sample predictions (same rows the model was fitted on), used by the
# metrics cell that follows.
predictions = lm.predict(X)

In [6]:
# Mean squared error (printed) and R-squared (echoed as the cell result) for
# whichever Y / predictions the most recently executed fitting cell left in scope.
print(mean_squared_error(y_true=Y, y_pred=predictions))
r2_score(y_true=Y, y_pred=predictions)

10.3960615412238


0.9099214780402995

In [7]:
from sklearn.preprocessing import MinMaxScaler

# How to rescale data to the range 0 to 1 (min-max scaling), based on each
# column's own minimum and maximum values.
transformer = MinMaxScaler()
transformer.fit(df[['MinutesOfBeingAwake','MinutesOfSleep']])

# transform() returns a plain array, so the preview frame below carries
# positional column labels (0, 1) rather than the original column names.
pd.DataFrame(
    data=transformer.transform(df[['MinutesOfBeingAwake','MinutesOfSleep']])
).head(5)

Unnamed: 0,0,1
0,0.333333,0.694394
1,0.448718,0.820976
2,0.589744,0.699819
3,0.397436,0.562387
4,0.833333,0.735986


In [8]:
from sklearn.preprocessing import Normalizer

# How to normalize data using Normalizer().
# Unlike min-max scaling, Normalizer works per ROW: each sample is rescaled to
# unit norm (L2 by default), so the squared values in a row sum to 1. It does
# not map each column onto the 0-to-1 range.
transformer = Normalizer()
transformer.fit(df[['MinutesOfBeingAwake','MinutesOfSleep']])

# transform() returns a plain array, so the preview frame below carries
# positional column labels (0, 1) rather than the original column names.
pd.DataFrame(
    data=transformer.transform(df[['MinutesOfBeingAwake','MinutesOfSleep']])
).head(5)

Unnamed: 0,0,1
0,0.067554,0.997716
1,0.076864,0.997042
2,0.118032,0.99301
3,0.099187,0.995069
4,0.157707,0.987486


In [9]:
# How to transform a categorical column to numerical data with label encoding:
# each distinct value of 'Months' is replaced by its integer category code.
# The full code array (one entry per row) is echoed as the cell result.
pd.Categorical(values=df['Months']).codes

array([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2

In [10]:
# How to transform a categorical column to numerical data with one-hot encoding.
# get_dummies returns a NEW DataFrame in which 'Days' is replaced by one
# indicator column per distinct value (Days_<value>); `df` itself is not modified.
pd.get_dummies(data=df, columns=["Days"]).head(5)

Unnamed: 0,Date,Calorie burned,Steps,Distance,Floors,Minutes Sedentary,Minutes Lightly Active,Minutes Fairly Active,Minutes Very Active,Activity Calories,...,Yesterday_sleep_efficiency,Months,Months_encoded,Days_Friday,Days_Monday,Days_Saturday,Days_Sunday,Days_Thursday,Days_Tuesday,Days_Wednesday
0,2015-05-08,1934,905,0.65,0,1.355,46,0,0,1680,...,0.0,May,5,1,0,0,0,0,0,0
1,2015-05-09,3631,18925,14.11,4,611.0,316,61,60,2248,...,92.086331,May,5,0,0,1,0,0,0,0
2,2015-05-10,3204,14228,10.57,1,602.0,226,14,77,1719,...,92.464358,May,5,0,0,0,1,0,0,0
3,2015-05-11,2673,6756,5.02,8,749.0,190,23,4,9620,...,88.761468,May,5,0,1,0,0,0,0,0
4,2015-05-12,2495,502,3.73,1,876.0,171,0,0,7360,...,88.857143,May,5,0,0,0,0,0,1,0


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

churnData = pd.read_csv('./data/Customer-Churn.csv')

# How to make a Logistic Regression model using Python.
# Three numeric predictors are standardized, then used to classify 'Churn'.
numericData = churnData[['tenure','SeniorCitizen','MonthlyCharges']]

# Kept from the original cell; not used by the fit below. Note that this
# rebinds the notebook-level `Y` that the earlier regression cells assigned.
Y = pd.DataFrame(data=churnData, columns=['Churn'])

# Standardize the predictors (fit the scaler, then apply it to the same frame).
transformer = StandardScaler()
transformer.fit(numericData)
scaled_x = transformer.transform(numericData)

classification = LogisticRegression(multi_class='ovr', solver='lbfgs', random_state=0)
classification.fit(scaled_x, churnData['Churn'])

# Mean accuracy on the same rows the model was fitted on (this cell makes no
# train/test split, so this is a training-set score).
classification.score(scaled_x, churnData['Churn'])

0.7911401391452506