**Autor:** __Tiago Dias__

# Desafio 7 TNT

### Importando bibliotecas

In [4]:
!pip install paho-mqtt



In [25]:
import psycopg2
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.cm as cm
from time import time, gmtime

### Extração de dados API

In [6]:
import paho.mqtt.subscribe as subscribe

topics = ['tnt']
HOST = 'HOST'
PORT = 30573
USERNAME = 'USERNAME'
PASSWORD = 'PASSWORD'
KEEPALIVE = 600

msg = subscribe.simple(topics,
                       msg_count=17000,
                       hostname=HOST, 
                       port=PORT, 
                       auth={'username':USERNAME, 'password':PASSWORD})

In [7]:
len(msg)

17000

In [8]:
msg[0].payload

b'{"Tempo": "2019-12-4", "Esta\xc3\xa7\xc3\xa3o": "Rep\xc3\xbablica", "LAT": "-23.5436", "LONG": "-46.6434", "Movimenta\xc3\xa7\xc3\xa3o": "110891", "Original_473": "52", "Original_269": "15", "Zero": "13", "Ma\xc3\xa7\xc3\xa3-Verde": "25", "Tangerina": "8", "Citrus": "6", "A\xc3\xa7a\xc3\xad-Guaran\xc3\xa1": "37", "P\xc3\xaassego": "34", "TARGET": "REABASTECER", "row": 7174}'

In [9]:
msg[1].payload

b'{"Tempo": "2019-5-23", "Esta\xc3\xa7\xc3\xa3o": "Ana Rosa", "LAT": "-23.5813", "LONG": "-46.6383", "Movimenta\xc3\xa7\xc3\xa3o": "64404", "Original_473": "86", "Original_269": "20", "Zero": "11", "Ma\xc3\xa7\xc3\xa3-Verde": "43", "Tangerina": "37", "Citrus": "8", "A\xc3\xa7a\xc3\xad-Guaran\xc3\xa1": "16", "P\xc3\xaassego": "12", "TARGET": "NORMAL", "row": 12023}'

### Transformação dos dados

In [11]:
import ast

lista_dict = []

for i in range((len(msg))):
  dict_str = msg[i].payload.decode("UTF-8")
  mydata = ast.literal_eval(dict_str)
  lista_dict.append(mydata)

In [12]:
df = pd.DataFrame(data=lista_dict)
df.shape

(17000, 15)

In [13]:
df.to_csv('datasetTNT.csv', index=False)

### Exploração dos dados

In [14]:
df.shape

(17000, 15)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Tempo         17000 non-null  object
 1   Estação       17000 non-null  object
 2   LAT           17000 non-null  object
 3   LONG          17000 non-null  object
 4   Movimentação  17000 non-null  object
 5   Original_473  17000 non-null  object
 6   Original_269  17000 non-null  object
 7   Zero          17000 non-null  object
 8   Maçã-Verde    17000 non-null  object
 9   Tangerina     17000 non-null  object
 10  Citrus        17000 non-null  object
 11  Açaí-Guaraná  17000 non-null  object
 12  Pêssego       17000 non-null  object
 13  TARGET        17000 non-null  object
 14  row           17000 non-null  int64 
dtypes: int64(1), object(14)
memory usage: 1.9+ MB


In [16]:
df.describe()

Unnamed: 0,row
count,17000.0
mean,8699.879235
std,4880.730049
min,1.0
25%,4444.0
50%,8933.5
75%,12885.5
max,17016.0


In [17]:
df.columns

Index(['Tempo', 'Estação', 'LAT', 'LONG', 'Movimentação', 'Original_473',
       'Original_269', 'Zero', 'Maçã-Verde', 'Tangerina', 'Citrus',
       'Açaí-Guaraná', 'Pêssego', 'TARGET', 'row'],
      dtype='object')

In [18]:
todas_colunas = list(df.columns)
colunas_int = ['Original_473','Original_269','Zero','Maçã-Verde','Tangerina','Citrus','Açaí-Guaraná','Pêssego']

In [20]:
df2 = df.copy()

for col in todas_colunas:
  if col in colunas_int:
    df2[col] = pd.to_numeric(df[col], downcast='integer')
  else:
    df2[col] = df[col].copy()

In [21]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Tempo         17000 non-null  object
 1   Estação       17000 non-null  object
 2   LAT           17000 non-null  object
 3   LONG          17000 non-null  object
 4   Movimentação  17000 non-null  object
 5   Original_473  17000 non-null  int8  
 6   Original_269  17000 non-null  int8  
 7   Zero          17000 non-null  int8  
 8   Maçã-Verde    17000 non-null  int8  
 9   Tangerina     17000 non-null  int8  
 10  Citrus        17000 non-null  int8  
 11  Açaí-Guaraná  17000 non-null  int8  
 12  Pêssego       17000 non-null  int8  
 13  TARGET        17000 non-null  object
 14  row           17000 non-null  int64 
dtypes: int64(1), int8(8), object(6)
memory usage: 1.0+ MB


In [22]:
df2['Estação'].value_counts()

São Judas                 793
Luz                       790
Butantã                   785
Fradique Coutinho         779
Ana Rosa                  778
Brooklin                  766
Campo Belo                764
São Joaquim               764
Higienópolis-Mackenzie    738
Vila Mariana              729
Faria Lima                717
Pinheiros                 715
Consolação                701
República                 700
Sé                        699
Eucaliptos                689
Brigadeiro                688
Paraíso                   686
Trianon-Masp              686
Tamanduateí               677
Brás                      677
Tatuapé                   622
Barra Funda               621
Moema                     318
Hospital São Paulo        118
Name: Estação, dtype: int64

In [23]:
df2['Movimentação'].value_counts()

64459    7
22921    7
24568    7
35332    7
23733    7
        ..
46601    1
67085    1
23277    1
66049    1
29196    1
Name: Movimentação, Length: 9934, dtype: int64

In [24]:
df2['TARGET'].value_counts()

NORMAL         11535
REABASTECER     5465
Name: TARGET, dtype: int64

In [26]:
le = LabelEncoder()
df2['Estação'] = le.fit_transform(df2['Estação'])

In [27]:
features = ['Estação','Original_473','Original_269','Zero','Maçã-Verde','Tangerina','Citrus','Açaí-Guaraná','Pêssego']

In [28]:
X = df2[features]
y = df2.TARGET

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [36]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
y_pred = clf.predict(X_test)

In [38]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      NORMAL       0.95      0.89      0.92      2289
 REABASTECER       0.79      0.90      0.85      1111

    accuracy                           0.89      3400
   macro avg       0.87      0.90      0.88      3400
weighted avg       0.90      0.89      0.89      3400

