Linear Regression (from Python Machine Learning for Beginners 6.2)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
# Ask Seaborn which built-in example datasets are available
available_datasets = sns.get_dataset_names()
available_datasets

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

In [4]:
# load_dataset() takes the name of a Seaborn dataset and returns it as a
# Pandas dataframe; here we load "tips" and preview its first five rows
tips_df = sns.load_dataset(name="tips")
tips_df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Load the "diamonds" dataset into a dataframe and preview its first five rows
diamond_df = sns.load_dataset(name="diamonds")
diamond_df.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
# First, split the data into a feature set and a label set: the labels are
# the values of the "tip" column, and the features are all remaining columns
y = tips_df['tip']
X = tips_df.drop(columns='tip')

In [7]:
# Preview the first five rows of the feature set
X.head(n=5)

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [8]:
# Preview the first five values of the label set
y.head(n=5)

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

In [9]:
# Most machine learning algorithms can only work with numbers, so categorical
# data must be converted into a numeric format. As a first step, build a
# dataframe of the columns that are already numeric by dropping the
# categorical columns from the feature set.
numerical = X.drop(columns=['sex', 'smoker', 'day', 'time'])

In [10]:
# Confirm that the "numerical" dataframe contains numeric columns only
numerical.head(n=5)

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [11]:
# Create a dataframe that contains only the categorical columns.
# Explicit column selection raises a KeyError if a name is misspelled or
# missing, whereas DataFrame.filter() would silently skip it.
categorical = X[['sex', 'smoker', 'day', 'time']]
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [None]:
# One of the most common approaches to convert a categorical column to a 
# numeric one is via one-hot encoding. In one-hot encoding, for every unique 
# value in the original columns, a new column is created.

# For instance, for sex, two columns: Female and Male, are created. If the 
# original sex column contained male, a 1 is added in the newly created Male 
# column, while 1 is added in the newly created Female column if the 
# original sex column contained Female.

# However, it can be noted that we do not really need two columns. A single 
# column, i.e., Female, is enough: when a customer is female, we add 1 
# in the Female column; otherwise, we add 0 in that column. Hence, we 
# need N-1 one-hot encoded columns for all the N values in the original column.

# Convert the categorical columns into one-hot encoded columns using
# pd.get_dummies(). drop_first=True removes the first level of each column,
# leaving N-1 indicator columns for N distinct values. dtype=int forces 0/1
# integers: since pandas 2.0 the default indicator dtype is bool (True/False).
cat_numerical = pd.get_dummies(categorical, drop_first=True, dtype=int)