In [1]:
# In this example we want to predict if a customer is likely to buy something or not.
# The prediction is based in the webpages that the customer has accessed.
# It is a simplification of this application that should usually join more data such as the time spent and
# the order of the navigation. But for the sake of this introduction let us consider it is possible to
# predict with good accuracy if the customer is likely to buy or not.

# We gonna use pandas for reading a file in CSV format
import pandas as pd

# uri = "https://gist.githubusercontent.com/guilhermesilveira/24e271e68afe8fd257911217b88b2e07/raw/e70287fb1dcaad4215c3f3c9deda644058a616bc/movie_metadata.csv"
uri = "https://raw.githubusercontent.com/guilhermesilveira/machine-learning/master/capitulo4/acesso.csv"
data = pd.read_csv(uri)

# This is how the data looks like
data.sample(5)

Unnamed: 0,home,como_funciona,contato,comprou
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,1,1,0,0


In [3]:
# Let us refers to the features like they are called originally in the dataset file
print("The name of the columns are: ",data.columns)

# We have three features (inputs)
x = data[["home"," como_funciona"," contato"]]
x.head()

The name of the columns are:  Index(['home', ' como_funciona', ' contato', ' comprou'], dtype='object')


Unnamed: 0,home,como_funciona,contato
0,1,1,0
1,1,1,0
2,1,1,0
3,1,1,0
4,1,1,0


In [4]:
# We one output
y = data[[" comprou"]]
y.head()

Unnamed: 0,comprou
0,0
1,0
2,0
3,0
4,0


In [5]:
# NOw we gonna train a linear model using the whole data
# sklearn.svm.LinearSVC
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(x, y)

# Here we are doing predictions, but we are asking to predict the answer of the same samples we have used to train
biased_predictions = model.predict(x)

# It is actually a bad way to do that. We can't be sure how good is our model actually.
from sklearn.metrics import accuracy_score
biased_accuracy = accuracy_score(y, biased_predictions) * 100
print("Using same data to evaluate accuracy it shows a high rate: ", biased_accuracy, "%")

Using same data to evaluate accuracy it shows a high rate:  96.96969696969697 %


  y = column_or_1d(y, warn=True)


In [6]:
# The right thing to do is to split our dataset in train and test. The test part will not be used in the training
# It will only be used to evaluate the model
print("The right thing to do is to split the dataset")
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y)

model.fit(train_x, train_y)
predictions = model.predict(test_x)
accuracy = accuracy_score(test_y, predictions) * 100
print("A more realistic rate: ", accuracy, "%")

The right thing to do is to split the dataset
A more realistic rate:  100.0 %


  y = column_or_1d(y, warn=True)
