# Parte 3
## XGBoost

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split
# Hypter-parametros
from sklearn.model_selection import RandomizedSearchCV
# Metrica 
from sklearn.metrics import roc_auc_score
# Encoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
# CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
np.random.seed(seed=27)

In [3]:
train_df = pd.read_parquet('/content/drive/MyDrive/Orga Datos/TP3/TP3-OK/train.parquet')
test_df = pd.read_parquet('/content/drive/MyDrive/Orga Datos/TP3/TP3-OK/test.parquet')

train_df.content.fillna(' ', inplace=True)
test_df.content.fillna(' ', inplace=True)

In [4]:
X = train_df[train_df.columns[2:-2]] # no incluyo la url ni las cols popular ni shares
y = train_df[train_df.columns[-2]] # col popular (la que quiero predecir)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# CountVectorizer para encodear

In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
content_list_X_train = X_train['content'].to_list()
count_vectorizer = CountVectorizer(lowercase=True, stop_words=stop_words, analyzer="word", max_features=5)

In [9]:
tf_matrix_X_train = count_vectorizer.fit_transform(content_list_X_train)
tf_array_X_train = tf_matrix_X_train.toarray()

In [10]:
most_common_words_X_train_df = pd.DataFrame(tf_array_X_train, columns=count_vectorizer.get_feature_names_out())

In [11]:
X_train = pd.concat([X_train.reset_index(), most_common_words_X_train_df.reset_index()], axis=1)

In [12]:
X_train = X_train.drop(columns=['index', 'content'])

In [13]:
content_list_X_test = X_test['content'].to_list()

In [14]:
tf_matrix_X_test = count_vectorizer.transform(content_list_X_test)
tf_array_X_test = tf_matrix_X_test.toarray()

In [15]:
most_common_words_X_test_df = pd.DataFrame(tf_array_X_test, columns=count_vectorizer.get_feature_names_out())

In [16]:
X_test = pd.concat([X_test.reset_index(), most_common_words_X_test_df.reset_index()], axis=1)

In [17]:
X_test = X_test.drop(columns=['index', 'content'])

In [18]:
# Encodeo surprise1

In [19]:
oho_surprise1_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_surprise1_train = oho_surprise1_encoder.fit_transform(X_train[['surprise1']]).todense().astype(int)
encoded_surprise1_test = oho_surprise1_encoder.transform(X_test[['surprise1']]).todense().astype(int)

In [20]:
oho_categories_surprise1_X_train_df = pd.DataFrame(encoded_surprise1_train, columns=oho_surprise1_encoder.categories_)

In [21]:
X_train = pd.concat([X_train.reset_index(), oho_categories_surprise1_X_train_df], axis=1)

In [22]:
X_train = X_train.drop(columns=['index', 'surprise1'])

In [23]:
oho_categories_surprise1_X_test_df = pd.DataFrame(encoded_surprise1_test, columns=oho_surprise1_encoder.categories_)

In [24]:
X_test = pd.concat([X_test.reset_index(), oho_categories_surprise1_X_test_df], axis=1)

In [25]:
X_test = X_test.drop(columns=['index', 'surprise1'])

In [26]:
# Encodeo surprise2

In [27]:
oho_surprise2_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_surprise2_train = oho_surprise2_encoder.fit_transform(X_train[['surprise2']]).todense().astype(int)
encoded_surprise2_test = oho_surprise2_encoder.transform(X_test[['surprise2']]).todense().astype(int)

In [28]:
oho_categories_surprise2_X_train_df = pd.DataFrame(encoded_surprise2_train, columns=oho_surprise2_encoder.categories_)

In [29]:
X_train = pd.concat([X_train.reset_index(), oho_categories_surprise2_X_train_df], axis=1)

In [30]:
X_train = X_train.drop(columns=['index', 'surprise2'])

In [31]:
oho_categories_surprise2_X_test_df = pd.DataFrame(encoded_surprise2_test, columns=oho_surprise2_encoder.categories_)

In [32]:
X_test = pd.concat([X_test.reset_index(), oho_categories_surprise2_X_test_df], axis=1)

In [33]:
X_test = X_test.drop(columns=['index', 'surprise2'])

In [34]:
# LLeno Nans

In [35]:
def llenar_nulos(df):
  df['n_tokens_title'].fillna(df['n_tokens_title'].mean(), inplace=True)
  df['n_tokens_content'].fillna(df['n_tokens_content'].mean(), inplace=True)
  df['n_non_stop_unique_tokens'].fillna(df['n_non_stop_unique_tokens'].mean(), inplace=True)
  df['num_hrefs'].fillna(float(df['num_hrefs'].mode()), inplace=True)
  df['num_self_hrefs'].fillna(float(df['num_self_hrefs'].mode()), inplace=True)
  df['num_imgs'].fillna(df['num_imgs'].min(), inplace=True)
  df['num_videos'].fillna(df['num_videos'].min(), inplace=True)
  df['LDA_00'].fillna(df['LDA_00'].mean(), inplace=True)
  df['LDA_01'].fillna(df['LDA_01'].mean(), inplace=True)
  df['LDA_02'].fillna(df['LDA_02'].mean(), inplace=True)
  df['LDA_03'].fillna(df['LDA_03'].mean(), inplace=True)
  df['kw_min_min'].fillna(float(df['kw_min_min'].mode()), inplace=True)
  df['kw_max_min'].fillna(float(df['kw_max_min'].mode()), inplace=True)
  df['kw_min_avg'].fillna(float(df['kw_min_avg'].mode()), inplace=True)
  df['kw_avg_min'].fillna(float(df['kw_avg_min'].mode()), inplace=True)
  df['kw_min_max'].fillna(float(df['kw_min_max'].mode()), inplace=True)
  df['kw_max_max'].fillna(float(df['kw_max_max'].mode()), inplace=True)
  df['kw_min_avg'].fillna(float(df['kw_min_avg'].mode()), inplace=True)
  df['self_reference_avg_sharess'].fillna(float(df['self_reference_avg_sharess'].mode()), inplace=True)
  df['max_positive_polarity'].fillna(df['max_positive_polarity'].max(), inplace=True)

  df.fillna(0.0, inplace=True)

In [36]:
llenar_nulos(X_train)
llenar_nulos(X_test)

In [37]:
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.2, random_state=42)

In [38]:
modeloXG = XGBClassifier()

In [39]:
param_grid = [
  { 'gamma': [1, 10, 20],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'reg_alpha': [1, 2, 4],
    'reg_lambda': [1,5,10]
  }
]
clf = RandomizedSearchCV(modeloXG, param_grid, cv = 2, n_jobs=1, n_iter=3)
best_clf = clf.fit(X_train, y_train)

In [40]:
predictions = best_clf.predict_proba(X_valid)[:,1]

In [41]:
metric = roc_auc_score(y_valid, predictions)
print("La metrica del modelo para validacion es: ", metric)

La metrica del modelo para validacion es:  0.7041113872879425
