In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, normalize, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from pathlib import Path

In [44]:
tweets = pd.read_csv("data/train_pre_processing_3.csv")
test = pd.read_csv("data/test_pre_processing_3.csv")

submit = test['id'].to_frame()

In [45]:
encode_columns = ['keyword', 'location', 'text', 'keyword_grouped']

encode_tweets = tweets[encode_columns]
encode_tweets = encode_tweets.astype('str')
encode_tweets = encode_tweets.apply(LabelEncoder().fit_transform)
tweets_encode_drop = tweets.drop(encode_columns, axis = 1)
tweets_encode = pd.concat([tweets_encode_drop, encode_tweets], axis = 1)
tweets_encode.drop(axis=1, labels=['target'], inplace=True)

Y = tweets['target']

In [17]:
seed = 7
test_size = 0.20
X = normalize(tweets_encode, norm='l1', axis=0, copy=True)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [18]:
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [19]:
preds = knn_model.predict(X_test)

In [20]:
roc_auc_score(y_test, preds)

0.7328807212645401

In [21]:
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.743270


In [None]:
# ROC = 0.7328807212645401
# ACC = 0.743270
# 

## Try StandardScaler

In [46]:
scaler = StandardScaler()
scaler.fit(tweets_encode)
sc_transform = scaler.transform(tweets_encode)
X = pd.DataFrame(sc_transform)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [47]:
seed = 7
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [48]:
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=15, p=2,
           weights='uniform')

In [49]:
preds = knn_model.predict(X_test)
roc_auc_score(y_test, preds)

0.7850548106045915

In [50]:
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.799737


In [51]:
# ROC = 0.7850548106045915
# ACC = 0.799737

## Predecir Kaggle test set

In [52]:
knn_model.fit(X, Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=15, p=2,
           weights='uniform')

In [53]:
encode_tweets = test[encode_columns]
encode_tweets = encode_tweets.astype('str')
encode_tweets = encode_tweets.apply(LabelEncoder().fit_transform)
tweets_encode_drop = test.drop(encode_columns, axis = 1)
tweets_encode = pd.concat([tweets_encode_drop, encode_tweets], axis = 1)
tweets_encode.drop(axis=1, labels=['id'], inplace=True)

In [54]:
scaler = StandardScaler()
scaler.fit(tweets_encode)
sc_transform = scaler.transform(tweets_encode)
X = pd.DataFrame(sc_transform)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [55]:
preds = knn_model.predict(X)
submit['target'] = pd.DataFrame(preds)
submit

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


In [56]:
Path("result").mkdir(parents=True, exist_ok=True)
submit.to_csv('result/submit.csv', index=False)