In [1]:
import json
import re

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import  r2_score, accuracy_score, mean_squared_error

<h1>Pipeline Projeto N.E.M</h1>
<h2>Implementação de Gradient Boosting Trees</h2>

<h3>Lendo a Base</h3>

In [2]:
# Load the survey data set (file is expected next to the notebook).
csv_path = "readingWithSpeed.csv"
raw_df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first records as the cell output.
print(raw_df.shape)
raw_df.head()

(2832, 15)


Unnamed: 0,Age,Sex,Race,Marital status?,Education,Employement,Incomes,How many books did you read during last 12months?,Read any printed books during last 12months?,Read any audiobooks during last 12months?,Read any e-books during last 12months?,"Last book you read, you…",Do you happen to read any daily news or newspapers?,Do you happen to read any magazines or journals?,Reading Speed
0,66,Male,Refused,Divorced,College graduate,Retired,"$20,000 to under $30,000",97,Yes,No,Yes,Purchased the book,No,Yes,317
1,46,Male,Native American/American Indian,Married,High school graduate,Employed full-time,"Less than $10,000",97,Yes,Yes,Yes,Purchased the book,Yes,Yes,253
2,32,Male,Mixed race,Never been married,High school graduate,Employed full-time,"Less than $10,000",97,No,Yes,Yes,Borrowed the book from a friend or family member,Yes,Yes,262
3,27,Male,Mixed race,Married,High school graduate,Employed full-time,"$40,000 to under $50,000",97,Yes,No,Yes,Borrowed the book from a library,Yes,No,255
4,16,Female,Mixed race,Never been married,High school incomplete,Employed part-time,"$10,000 to under $20,000",97,Yes,Yes,No,Purchased the book,Yes,No,208


In [3]:
# Rows with any missing value are removed first, then the columns that are not
# used as features are discarded (order kept so the same rows survive).
columns_to_discard = ['Race', 'Last book you read, you…', 'Incomes']
raw_df = raw_df.dropna().drop(columns=columns_to_discard)
raw_df.head()

Unnamed: 0,Age,Sex,Marital status?,Education,Employement,How many books did you read during last 12months?,Read any printed books during last 12months?,Read any audiobooks during last 12months?,Read any e-books during last 12months?,Do you happen to read any daily news or newspapers?,Do you happen to read any magazines or journals?,Reading Speed
0,66,Male,Divorced,College graduate,Retired,97,Yes,No,Yes,No,Yes,317
1,46,Male,Married,High school graduate,Employed full-time,97,Yes,Yes,Yes,Yes,Yes,253
2,32,Male,Never been married,High school graduate,Employed full-time,97,No,Yes,Yes,Yes,Yes,262
3,27,Male,Married,High school graduate,Employed full-time,97,Yes,No,Yes,Yes,No,255
4,16,Female,Never been married,High school incomplete,Employed part-time,97,Yes,Yes,No,Yes,No,208


In [4]:
# Short snake_case aliases for the long survey-question column headers.
question_aliases = {
    "How many books did you read during last 12months?": "n_books",
    "Read any printed books during last 12months?": "printed_books",
    "Read any audiobooks during last 12months?": "audiobooks",
    "Read any e-books during last 12months?": "e_books",
    "Do you happen to read any daily news or newspapers?": "reads_newspaper",
    "Do you happen to read any magazines or journals?": "reads_magazines_journals",
}


def _strip_special_chars(column_name):
    """Keep only ASCII letters, digits and underscores in a column name."""
    return re.sub('[^A-Za-z0-9_]+', '', column_name)


raw_df = (
    raw_df
    .rename(columns=question_aliases)
    .rename(columns=_strip_special_chars)
)
print(raw_df.columns)

Index(['Age', 'Sex', 'Maritalstatus', 'Education', 'Employement', 'n_books',
       'printed_books', 'audiobooks', 'e_books', 'reads_newspaper',
       'reads_magazines_journals', 'ReadingSpeed'],
      dtype='object')


<h3>Limpeza e One-Hot Encoding</h3>

In [5]:
# Binary survey answers become 0/1; "Don’t know" is treated as a "No".
answer_to_flag = {'Yes' : 1, 'No': 0, 'Don’t know' : 0}
binary_columns = [
    'printed_books',
    'audiobooks',
    'e_books',
    'reads_newspaper',
    'reads_magazines_journals',
]
for binary_column in binary_columns:
    raw_df[binary_column] = raw_df[binary_column].replace(answer_to_flag)

# Every column except Age, n_books and Sex (not referenced again in this cell).
columns_dont_know = raw_df.columns.drop(['Age', 'n_books', 'Sex'])

# An unknown marital status is folded into 'Never been married'.
marital_status = raw_df['Maritalstatus']
raw_df['Maritalstatus'] = marital_status.where(
    marital_status != 'Don’t know', 'Never been married'
)

# One-hot encode the remaining text columns, then strip the characters that
# the category labels introduced into the generated column names.
raw_df = pd.get_dummies(raw_df)
raw_df = raw_df.rename(columns=lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

In [6]:
# Separate the regression target from the feature matrix.
target_column = 'ReadingSpeed'
x = raw_df.drop(columns=[target_column])
y = raw_df[target_column]

# 75 % / 25 % train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=584,
)

<blockquote>Função genérica que imprime os resultados</blockquote>

In [7]:
def print_results(predictions, y_testy):
    """Print and return regression metrics for a set of predictions.

    Parameters
    ----------
    predictions : array-like
        Values predicted by the model.
    y_testy : array-like
        Ground-truth values, aligned element-wise with ``predictions``.

    Returns
    -------
    tuple
        ``(r2, out_mape, out_mse)``: the R2 score, the mean absolute error
        rounded to 2 decimals and the mean squared error rounded to 2 decimals.
    """
    errors = abs(predictions - y_testy)
    sqr_error = errors ** 2
    # Despite its name, `out_mape` holds the mean absolute error (not a
    # percentage error); the name is kept for callers that unpack the tuple.
    out_mape = round(np.mean(errors), 2)
    out_mse = round(np.mean(sqr_error), 2)
    # Labels carry no unit: the original 'degrees' / '%' suffixes did not match
    # a reading-speed target or a unitless R2 value.
    print('Mean Absolute Error:', out_mape)
    print('Mean Squared Error:', out_mse)
    # r2_score is not symmetric: its signature is (y_true, y_pred).
    r2 = r2_score(y_testy, predictions)
    print('R2 Score:', r2)

    return r2, out_mape, out_mse

<h3>Gradient Boosting</h3>
<h4>Pega os parâmetros encontrados no notebook de testes e faz o fit</h4>

In [8]:
# Hyper-parameters for the regressor are read from a JSON file.
# `json` is imported in the notebook's import cell; the encoding is pinned so
# the read does not depend on the platform's default locale.
with open('params.json', 'r', encoding='utf-8') as fp:
    best_params_ = json.load(fp)

<b>Treinando a base</b>

In [9]:
%%time
gbm = lgb.LGBMRegressor(**best_params_)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

y_pred = gbm.predict(X_test)

[1]	valid_0's l1: 28.811	valid_0's l2: 1275
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 27.69	valid_0's l2: 1178.31
[3]	valid_0's l1: 26.4093	valid_0's l2: 1088.33
[4]	valid_0's l1: 24.412	valid_0's l2: 936.431
[5]	valid_0's l1: 22.6292	valid_0's l2: 811.202
[6]	valid_0's l1: 21.1496	valid_0's l2: 722.09
[7]	valid_0's l1: 19.6357	valid_0's l2: 632.139
[8]	valid_0's l1: 18.6538	valid_0's l2: 577.904
[9]	valid_0's l1: 17.4281	valid_0's l2: 509.848
[10]	valid_0's l1: 16.3024	valid_0's l2: 452.188
[11]	valid_0's l1: 15.258	valid_0's l2: 396.838
[12]	valid_0's l1: 14.2566	valid_0's l2: 346.414
[13]	valid_0's l1: 13.3553	valid_0's l2: 306.196
[14]	valid_0's l1: 12.6401	valid_0's l2: 274.7
[15]	valid_0's l1: 11.9064	valid_0's l2: 245.707
[16]	valid_0's l1: 11.2894	valid_0's l2: 222.178
[17]	valid_0's l1: 10.6744	valid_0's l2: 198.825
[18]	valid_0's l1: 10.1745	valid_0's l2: 182.599
[19]	valid_0's l1: 9.82209	valid_0's l2: 168.891
[20]	valid_0's l1: 9.41697	va

<b>Em uma solução real esse dado já viria do front</b>

In [14]:
# Demo payload standing in for what the front end would send.  The model was
# fitted on the encoded feature matrix `x`, so the sample is taken from `x`
# (as a one-row DataFrame) rather than from the raw CSV, whose text columns
# and `Reading Speed` target column `gbm.predict` cannot consume.
user_ = x.iloc[[0]]

<b>Método da API que retornaria a velocidade de leitura</b>

In [11]:
# [Get("/powerReader/api/ml/getReadingSpeedGivenUser")]
def get_reading_speed_given_user(user):
    """Return the trained model's reading-speed prediction for ``user``.

    ``user`` is handed unchanged to ``gbm.predict`` and the result of that
    call is returned as-is.
    NOTE(review): presumably ``user`` must carry the same encoded feature
    columns as the training matrix — confirm before wiring this to the API.
    """
    predicted_speed = gbm.predict(user)
    return predicted_speed