In [29]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk as nltk
import tqdm as tqdm


In [30]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [31]:
#Let's see the percentage of target == 1 and target == 0
print(df.target.value_counts())
print(df.shape)

target
0    4342
1    3271
Name: count, dtype: int64
(7613, 5)


In [32]:
#rate of missing values
df.isnull().sum()/len(df)*100

id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

Lots of missing values in location (33%). in keyword only 0.8% missing values.

In [33]:
df['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [34]:
df['location'].value_counts()

location
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: count, Length: 3341, dtype: int64

In [35]:
#Remove one row of the duplicated ones:
df = df.drop_duplicates(subset=['text'])

In [36]:
df.shape

(7503, 5)

In [37]:
#We can start extracting some information from the text column. We create a new column called 'text length' which is the number of characters in the text column.
df['text length'] = df['text'].apply(len)
df.head()

Unnamed: 0,id,keyword,location,text,target,text length
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,38
2,5,,,All residents asked to 'shelter in place' are ...,1,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88


In [38]:
#We also create a new column called 'word count' which is the number of words in the text column.
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
df.head()

Unnamed: 0,id,keyword,location,text,target,text length,word_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16


In [39]:
#include feature called 'has_hashtag' which is 1 if the tweet contains a hashtag and 0 otherwise.
df['has_hashtag'] = df['text'].apply(lambda x: 1 if '#' in str(x) else 0)

#We also include a feature called 'has_at' which is 1 if the tweet contains an @ and 0 otherwise.
df['has_at'] = df['text'].apply(lambda x: 1 if '@' in str(x) else 0)

#We create a new column called 'has_link' which is 1 if the tweet contains a link and 0 otherwise.
df['has_link'] = df['text'].apply(lambda x: 1 if 'http' in str(x) else 0)

df.head()

Unnamed: 0,id,keyword,location,text,target,text length,word_count,has_hashtag,has_at,has_link
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69,13,1,0,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,7,0,0,0
2,5,,,All residents asked to 'shelter in place' are ...,1,133,22,0,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8,1,0,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,16,1,0,0


In [40]:
#We pick the most common keyword for the missing values.
df['keyword'].fillna('fatalities', inplace=True) #A fatality includes lots of disasters

Let's study the missing values in location.

In [42]:
#We will use a library to know the country of each location

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_user_agent")

# #We create a function that returns the country of a location
def get_country(location):
    if location == 'Unknown':
        return 'Unknown'
    try:
        country = geolocator.geocode(location).raw['display_name'].split(',')[-1]
        return country
    except Exception as e:
        print(f"Error occurred while geocoding location: {location}")
        #print(f"Error message: {str(e)}")
        print(f"Index: {int(df[df['location'] == location].index[0])/len(df)*100}")
        return 'Unknown'

# #change NaN to Unknown
# df['location'].fillna('Unknown', inplace=True)
# df['country'] = df['location'].apply(get_country)
# #we drop df['location'] because we don't need it anymore
df.drop('location', axis=1, inplace=True)
# df['country'].to_csv('country.csv', index=False)


Now we can remove df['location'] columns as there are 33% missing values.

In [114]:
#We will compute a further analysis on each text. We will analyze the sentiment of each text. We will use transformers and pipelines to do this.
#We will use the transformers library to do this. We will use the pre-trained sentiment analysis model from Hugging Face.

from transformers import pipeline
# from tqdm import tqdm
classifier = pipeline('sentiment-analysis')

def get_sentiment(x):
  label = classifier(x)[0]['label']
  print(f"Index: {int(df[df['text'] == x].index[0])/len(df)*100}")
  if label == 'NEGATIVE':
    return 0
  else:
    return 1
#We will create a new column called 'sentiment' which will contain the sentiment of each text.
#df['sentiment'] = df['text'].apply(lambda x: get_sentiment(x))
# df.head()
#df['sentiment'].to_csv('sentiment.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm





No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


It makes no sense to find the sentiment of each keyword, as they will be negative (they are disasters).

In [115]:
#Now we will analyze the formality of each text. We will use the pre-trained formality model from Hugging Face.
formality = pipeline('text-classification', model="s-nlp/roberta-base-formality-ranker")
# df['is_formal'] = df['text'].apply(lambda x: formality(x)[0]['label'])
# df['is_formal'] = df['is_formal'].apply(lambda x: int(1) if x == 'formal' else 0)

In [116]:
#We also analyze the toxicity of each text. We will use the pre-trained toxicity model from Hugging Face.

nlp = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-offensive')
# df['is_toxic'] = df['text'].apply(lambda x: nlp(x)[0]['label'])
# df['is_toxic'] = df['is_toxic'].apply(lambda x: int(1) if x == 'offensive' else 0)

In [47]:
#We include as columns the sentiment, formality and toxicity of each text from sentiment.csv, is_formal.csv and is_toxic.csv respectively.
df['sentiment'] = pd.read_csv('sentiment.csv')
df['is_formal'] = pd.read_csv('is_formal.csv')
df['is_toxic'] = pd.read_csv('is_toxic.csv')
df['country'] = pd.read_csv('country.csv')


In [48]:
df.head()

Unnamed: 0,id,keyword,text,target,text length,word_count,has_hashtag,has_at,has_link,sentiment,is_formal,is_toxic,country
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,69,13,1,0,0,0,0,0,Unknown
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,38,7,0,0,0,0,1,0,Unknown
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,133,22,0,0,0,0,1,0,Unknown
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,65,8,1,0,0,0,1,0,Unknown
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,88,16,1,0,0,0,0,0,Unknown


In [49]:
#We also look for non-dictionary word counts
from nltk.corpus import words, wordnet
#nltk.download('words')
words = set(words.words())

def does_exist(word, words):
    if word[0] == '#':
        word = word[1:len(word)] #We remove the hashtag
    if len(wordnet.synsets(word.lower()))>0 or word.lower() in words or word in words:
        return 1
    else:
        return 0

list_words = {word.lower() for word in words}


df['non_dictionary_word_count'] = df['text'].apply(lambda x: len([word for word in str(x).split() if not does_exist(word, list_words)]))
df.head()


Unnamed: 0,id,keyword,text,target,text length,word_count,has_hashtag,has_at,has_link,sentiment,is_formal,is_toxic,country,non_dictionary_word_count
0,1,fatalities,Our Deeds are the Reason of this #earthquake M...,1,69,13,1,0,0,0,0,0,Unknown,0
1,4,fatalities,Forest fire near La Ronge Sask. Canada,1,38,7,0,0,0,0,1,0,Unknown,2
2,5,fatalities,All residents asked to 'shelter in place' are ...,1,133,22,0,0,0,0,1,0,Unknown,3
3,6,fatalities,"13,000 people receive #wildfires evacuation or...",1,65,8,1,0,0,0,1,0,Unknown,1
4,7,fatalities,Just got sent this photo from Ruby #Alaska as ...,1,88,16,1,0,0,0,0,0,Unknown,0


In [66]:
#Firstly, we process the text column to remove the punctuation and the stopwords and remove hashtags and @s.
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
def text_processing(text):
    text = text.translate(str.maketrans('', '', string.punctuation)) #Discarting punctuation signs
    #we split the text into words:
    
    text = [word[1:len(word)] if word.startswith('#') else word for word in text.split()] #Remove the # from hashtags and split the text
    text = [word.lower() for word in text if word.lower() not in stop] #Discarting stopwords
    text = [word for word in text if not word.startswith('@')] #Discarting @s
    text = [word for word in text if not word.startswith('http')]
    #A function to process the english slang could be created here
    return " ".join(text)

#We apply the text_processing function to the text column
df['text'] = df['text'].apply(text_processing)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\genis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [54]:
#compute a word2vec embedding for each text. We will use the pre-trained word2vec model from Google.
from gensim.models import Word2Vec

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Load GloVe model 
glove_input_file = './word_embeddings/glove.6B.100d.txt/glove.6B.100d.txt'
word2vec_output_file = './word_embeddings/glove.6B.100d.word2vec/glove.6B.100d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False) #50 seconds lasts

# Define function to create averaged word vector for a text
def text_to_vector(text):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

  glove2word2vec(glove_input_file, word2vec_output_file)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\genis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [71]:
#We create a new column called 'word2vec' which contains the word2vec embedding of each text.
df['word2vec'] = df['text'].apply(text_to_vector)
df.head()

Unnamed: 0,id,keyword,text,target,text length,word_count,has_hashtag,has_at,has_link,sentiment,is_formal,is_toxic,country,non_dictionary_word_count,word2vec
0,1,fatalities,deeds reason earthquake may allah forgive us,1,69,13,1,0,0,0,0,0,Unknown,0,"[0.2537094, 0.52158827, 0.47551244, -0.2304120..."
1,4,fatalities,forest fire near la ronge sask canada,1,38,7,0,0,0,0,1,0,Unknown,2,"[-0.09041647, -0.33786145, 0.27815086, 0.42031..."
2,5,fatalities,residents asked shelter place notified officer...,1,133,22,0,0,0,0,1,0,Unknown,3,"[0.1305019, 0.06479403, -0.04907874, -0.153829..."
3,6,fatalities,13000 people receive wildfires evacuation orde...,1,65,8,1,0,0,0,1,0,Unknown,1,"[0.08861657, 0.25795904, 0.075865015, -0.09882..."
4,7,fatalities,got sent photo ruby alaska smoke wildfires pou...,1,88,16,1,0,0,0,0,0,Unknown,0,"[0.0026274389, 0.20838793, 0.19368011, -0.3950..."


In [86]:
#We will study the lenghts of the word2vec list. We will study the max, min and mean lengths.
print("max len: ",df['word2vec'].apply(len).max())
print("min len: ",df['word2vec'].apply(len).min())
print("mean len: ",df['word2vec'].apply(len).mean())
#As we can see, each of the word2vec lists has 100 elements.

max len:  100
min len:  100
mean len:  100.0


In [87]:
#Each element of the word2vec column is a numpy array. We will create a new column for each element of the numpy array.
for i in range(100):
    df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
df.drop('word2vec', axis=1, inplace=True)

  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])
  df[f'wvect_{i}'] = df['word2vec'].apply(lambda x: x[i])


### X, Y split

In [88]:
y = df['target']
X = df.drop(['id', 'target'], axis=1)
#split:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=33)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=33, test_size=0.25)


In [89]:
X_train.drop(['text'], axis=1, inplace=True)
X_train.head()
key_columns = X_train.columns


### ENCODING

In [90]:
#We compute a target encoding for the keyword column and the country column.
from category_encoders import TargetEncoder
te_keyword = TargetEncoder()
te_country = TargetEncoder()
te_keyword.fit(X_train['keyword'], y_train)
te_country.fit(X_train['country'], y_train)
X_train['keyword'] = te_keyword.transform(X_train['keyword'])
X_train['country'] = te_country.transform(X_train['country'])
X_train.head()


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)


Unnamed: 0,keyword,text length,word_count,has_hashtag,has_at,has_link,sentiment,is_formal,is_toxic,country,...,wvect_90,wvect_91,wvect_92,wvect_93,wvect_94,wvect_95,wvect_96,wvect_97,wvect_98,wvect_99
3676,0.27305,108,13,1,1,1,0,0,0,0.415073,...,-0.054044,0.046271,-0.018507,-0.326123,-0.428052,-0.103477,-0.301681,-0.278915,0.282725,0.116391
6179,0.33808,118,17,0,0,1,0,0,0,0.579375,...,0.042364,0.108716,-0.086385,0.238688,-0.264332,0.085559,-0.127437,0.16175,0.280647,-0.199788
6297,0.252433,138,24,0,1,0,0,1,0,0.411804,...,0.020816,-0.123129,-0.045104,0.208563,-0.349348,0.060759,-0.135892,-0.002793,0.060901,-0.141412
5429,0.51543,139,20,1,0,1,0,0,0,0.411804,...,-0.010154,-0.347303,0.022852,0.049404,-0.332427,0.268057,-0.194701,-0.041288,0.28159,-0.382639
2420,0.650722,107,19,0,0,0,0,0,0,0.415073,...,0.138369,0.079113,-0.150682,0.032456,-0.868274,0.238844,-0.010302,-0.066319,0.561664,0.096033


In [91]:
#We will normalize with standard scaler the whole dataset.

from sklearn.preprocessing import StandardScaler
features_scaled = ['keyword', 'country', 'text length', 'word_count', 'non_dictionary_word_count']
scaler = StandardScaler()
scaler.fit(X_train[features_scaled])
X_train[features_scaled] = scaler.transform(X_train[features_scaled])
X_train = pd.DataFrame(X_train, columns=key_columns)
X_train.head()

Unnamed: 0,keyword,text length,word_count,has_hashtag,has_at,has_link,sentiment,is_formal,is_toxic,country,...,wvect_90,wvect_91,wvect_92,wvect_93,wvect_94,wvect_95,wvect_96,wvect_97,wvect_98,wvect_99
3676,-0.856466,0.208423,-0.327457,1,1,1,0,0,0,-0.143314,...,-0.054044,0.046271,-0.018507,-0.326123,-0.428052,-0.103477,-0.301681,-0.278915,0.282725,0.116391
6179,-0.494509,0.505055,0.370895,0,0,1,0,0,0,2.794201,...,0.042364,0.108716,-0.086385,0.238688,-0.264332,0.085559,-0.127437,0.16175,0.280647,-0.199788
6297,-0.971226,1.098319,1.593011,0,1,0,0,1,0,-0.201767,...,0.020816,-0.123129,-0.045104,0.208563,-0.349348,0.060759,-0.135892,-0.002793,0.060901,-0.141412
5429,0.49263,1.127982,0.894659,1,0,1,0,0,0,-0.201767,...,-0.010154,-0.347303,0.022852,0.049404,-0.332427,0.268057,-0.194701,-0.041288,0.28159,-0.382639
2420,1.245672,0.178759,0.720071,0,0,0,0,0,0,-0.143314,...,0.138369,0.079113,-0.150682,0.032456,-0.868274,0.238844,-0.010302,-0.066319,0.561664,0.096033


In [82]:
np.array(df['word2vec'].tolist())

array([[ 0.25370941,  0.52158827,  0.47551244, ..., -0.43344015,
         0.03718871, -0.02817114],
       [-0.09041647, -0.33786145,  0.27815086, ...,  0.03711   ,
         0.49718142, -0.22046529],
       [ 0.1305019 ,  0.06479403, -0.04907874, ...,  0.37766913,
         0.32037783,  0.05776972],
       ...,
       [-0.09563959, -0.03029093,  0.08261544, ..., -0.13416302,
         0.47079039,  0.16659856],
       [ 0.15360999,  0.3709186 , -0.14963678, ...,  0.123735  ,
         0.76839125, -0.06275225],
       [-0.14072743, -0.09117275,  0.26531374, ...,  0.20203838,
         0.72892749,  0.130686  ]])

### MODELING

In [92]:
#Let's compute a simple logistic regression model. The metric we will use is the F1 score.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
print("F1-Score Train:",f1_score(y_train, y_pred))
print("Accuracy Train:",accuracy_score(y_train, y_pred))
# print("------------------------------------------")
# print("F1-Score Test:",f1_score(y_test, lr.predict(X_test)))
# print("Accuracy Test:",accuracy_score(y_test, lr.predict(X_test)))


F1-Score Train: 0.7812840043525571
Accuracy Train: 0.8213968366802914


In [93]:
#Bayesian Search CV
from skopt import BayesSearchCV

# We define the search space for logistic regression
log_search_spaces = {
    'C': (1e-6, 1e+6, 'log-uniform'),
    'penalty': ['l2']
}
# compute the best hyperparameters
opt = BayesSearchCV( LogisticRegression(), log_search_spaces, scoring='f1', n_iter=20, cv=5)
opt.fit(X_train, y_train)
print("Best score: %0.3f" % opt.best_score_)
print("Best parameters set:",opt.best_estimator_.get_params())



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best score: 0.768
Best parameters set: {'C': 0.18259831503207236, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [99]:
from sklearn.ensemble import RandomForestClassifier

random_search_spaces = {
    'n_estimators': (50, 300),
    'max_features': (2,13),
    #'max_depth': (5, 500),
    #'min_samples_split': (2, 10),
    # 'min_samples_leaf': (1, 10),
    # 'bootstrap': [True, False]
}
opt = BayesSearchCV( RandomForestClassifier(), random_search_spaces, scoring='f1', n_iter=20, cv=5)
opt.fit(X_train, y_train)
print("Best score: %0.3f" % opt.best_score_)
print("Best parameters set:",opt.best_estimator_.get_params())


KeyboardInterrupt: 

In [98]:
#Catboost
from catboost import CatBoostClassifier

cat_search_spaces = {
    'iterations': (10, 1000),
    'learning_rate': (0.01, 0.5),
    'depth': (1, 8),
    # 'l2_leaf_reg': (2, 30),
    # 'border_count': (1, 255),
    # 'bagging_temperature': (0, 1)
}
opt = BayesSearchCV( CatBoostClassifier(), cat_search_spaces, scoring='f1', n_iter=20, cv=5)
opt.fit(X_train, y_train)
print("Best score: %0.3f" % opt.best_score_)
print("Best parameters set:",opt.best_estimator_.get_params())

0:	learn: 0.6092957	total: 141ms	remaining: 1m 21s
1:	learn: 0.5752981	total: 143ms	remaining: 41.2s
2:	learn: 0.5451781	total: 145ms	remaining: 27.8s
3:	learn: 0.5271185	total: 146ms	remaining: 21s
4:	learn: 0.5162396	total: 148ms	remaining: 17s
5:	learn: 0.5076245	total: 150ms	remaining: 14.3s
6:	learn: 0.5016166	total: 153ms	remaining: 12.5s
7:	learn: 0.4967026	total: 156ms	remaining: 11.1s
8:	learn: 0.4908126	total: 159ms	remaining: 10s
9:	learn: 0.4863800	total: 161ms	remaining: 9.16s
10:	learn: 0.4826311	total: 163ms	remaining: 8.43s
11:	learn: 0.4777249	total: 166ms	remaining: 7.82s
12:	learn: 0.4731530	total: 168ms	remaining: 7.31s
13:	learn: 0.4705149	total: 170ms	remaining: 6.85s
14:	learn: 0.4671555	total: 172ms	remaining: 6.46s
15:	learn: 0.4648637	total: 174ms	remaining: 6.12s
16:	learn: 0.4622240	total: 176ms	remaining: 5.81s
17:	learn: 0.4591389	total: 178ms	remaining: 5.53s
18:	learn: 0.4574010	total: 179ms	remaining: 5.28s
19:	learn: 0.4556737	total: 181ms	remaining: 5

In [169]:
#xgboost:
from xgboost import XGBClassifier
xgb_search_spaces = {
    'n_estimators': (50, 400),
    'max_depth': (1, 7),
    'learning_rate': (0.01, 0.5),
    # 'gamma': (0, 1),
    # 'min_child_weight': (0, 5),
    # 'max_delta_step': (0, 5),
}

opt = BayesSearchCV( XGBClassifier(), xgb_search_spaces, scoring='f1', n_iter=20, cv=5)
opt.fit(X_train, y_train)
print("Best score: %0.3f" % opt.best_score_)
print("Best parameters set:",opt.best_estimator_.get_params())

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_catego

Best score: 0.778
Best parameters set: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.09343218780084628, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 3, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 148, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}


### Prepare and test on X_test

In [96]:
X_test['keyword'] = te_keyword.transform(X_test['keyword'])
X_test['country'] = te_country.transform(X_test['country'])
X_test[features_scaled] = scaler.transform(X_test[features_scaled])
X_test = pd.DataFrame(X_test, columns=key_columns)

  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)


In [170]:
#log_best_model = {'C': 0.0001, 'penalty': 'l2'}
log_best_model = {}
random_best_model = {'max_depth': 5, 'max_features': 2, 'min_samples_split': 2, 'n_estimators': 5} 
cat_best_model = {'depth': 3, 'iterations': 990, 'learning_rate': 0.08624}
xgb_best_model = {'learning_rate': 0.093432, 'max_depth': 3, 'n_estimators': 148}

lr_test  = LogisticRegression(**log_best_model)
random_test = RandomForestClassifier(**random_best_model)
cat_test = CatBoostClassifier(**cat_best_model, verbose=False)
xgb_test = XGBClassifier(**xgb_best_model)

lr_test.fit(X_train, y_train)
random_test.fit(X_train, y_train)
cat_test.fit(X_train, y_train)
xgb_test.fit(X_train, y_train)

print("log F1-Score Test:",f1_score(y_test, lr_test.predict(X_test)))
print("log Accuracy Test:",accuracy_score(y_test, lr_test.predict(X_test)))
print(classification_report(y_test, lr_test.predict(X_test)))
print("----------------------------------------------------------------------------------------")
print("random F1-Score Test:",f1_score(y_test, random_test.predict(X_test)))
print("random Accuracy Test:",accuracy_score(y_test, random_test.predict(X_test)))
print(classification_report(y_test, random_test.predict(X_test)))
print("----------------------------------------------------------------------------------------")
print("cat F1-Score Test:",f1_score(y_test, cat_test.predict(X_test)))
print("cat Accuracy Test:",accuracy_score(y_test, cat_test.predict(X_test)))
print(classification_report(y_test, cat_test.predict(X_test)))
print("----------------------------------------------------------------------------------------")
print("xgb F1-Score Test:",f1_score(y_test, xgb_test.predict(X_test)))
print("xgb Accuracy Test:",accuracy_score(y_test, xgb_test.predict(X_test)))
print(classification_report(y_test, xgb_test.predict(X_test)))

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


log F1-Score Test: 0.7570694087403599
log Accuracy Test: 0.7985074626865671
              precision    recall  f1-score   support

           0       0.81      0.84      0.83      1076
           1       0.78      0.74      0.76       800

    accuracy                           0.80      1876
   macro avg       0.80      0.79      0.79      1876
weighted avg       0.80      0.80      0.80      1876

----------------------------------------------------------------------------------------
random F1-Score Test: 0.6657101865136298
random Accuracy Test: 0.7515991471215352
              precision    recall  f1-score   support

           0       0.74      0.88      0.80      1076
           1       0.78      0.58      0.67       800

    accuracy                           0.75      1876
   macro avg       0.76      0.73      0.73      1876
weighted avg       0.76      0.75      0.74      1876

----------------------------------------------------------------------------------------
cat F1-Sco

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [168]:
# Print feature importances
# print("----------------------------------------------------------------------------------------")
# print(" Random Forest Feature Importances:")
# print(random_test.feature_importances_)
# print("----------------------------------------------------------------------------------------")
# print("CatBoosting Feature Importances:")
# print(cat_test.feature_importances_)
# print("----------------------------------------------------------------------------------------")

### EVALUATION on test.csv

In [171]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [172]:
ids_test = df_test['id']

In [173]:
df_test.isnull().sum()/len(df)*100

id           0.000000
keyword      0.346528
location    14.727442
text         0.000000
dtype: float64

In [174]:
df_test['text length'] = df_test['text'].apply(len)
df_test['word_count'] = df_test['text'].apply(lambda x: len(str(x).split()))
df_test['has_hashtag'] = df_test['text'].apply(lambda x: 1 if '#' in str(x) else 0)
df_test['has_at'] = df_test['text'].apply(lambda x: 1 if '@' in str(x) else 0)
df_test['has_link'] = df_test['text'].apply(lambda x: 1 if 'http' in str(x) else 0)
df_test['keyword'].fillna('fatalities', inplace=True) #A fatality includes lots of disasters

In [175]:

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_user_agent")

def get_country(location):
    if location == 'Unknown':
        return 'Unknown'
    try:
        country = geolocator.geocode(location).raw['display_name'].split(',')[-1]
        return country
    except Exception as e:
        print(f"Error occurred while geocoding location: {location}")
        print(f"Index: {int(df_test[df_test['location'] == location].index[0])/len(df_test)*100}")
        return 'Unknown'

#change NaN to Unknown
# df_test['location'].fillna('Unknown', inplace=True)
# df_test['country'] = df_test['location'].apply(get_country)
df_test.drop('location', axis=1, inplace=True)
# df_test['country'].to_csv('country_test.csv', index=False)

df_test['country'] = pd.read_csv('country_test.csv')

In [176]:
def get_sentiment(x):
  label = classifier(x)[0]['label']
  print(f"Index: {int(df_test[df_test['text'] == x].index[0])/len(df_test)*100}")
  if label == 'NEGATIVE':
    return 0
  else:
    return 1
# df_test['sentiment'] = df_test['text'].apply(lambda x: get_sentiment(x))
# df_test['sentiment'].to_csv('sentiment_test.csv', index=False)
df_test['sentiment'] = pd.read_csv('sentiment_test.csv')

# df_test['is_formal'] = df_test['text'].apply(lambda x: formality(x)[0]['label'])
# df_test['is_formal'] = df_test['is_formal'].apply(lambda x: int(1) if x == 'formal' else 0)
# df_test['is_formal'].to_csv('is_formal_test.csv', index=False)
df_test['is_formal'] = pd.read_csv('is_formal_test.csv')

# df_test['is_toxic'] = df_test['text'].apply(lambda x: nlp(x)[0]['label'])
# df_test['is_toxic'] = df_test['is_toxic'].apply(lambda x: int(1) if x == 'offensive' else 0)
# df_test['is_toxic'].to_csv('is_toxic_test.csv', index=False)
df_test['is_toxic'] = pd.read_csv('is_toxic_test.csv')

In [177]:
df_test['non_dictionary_word_count'] = df_test['text'].apply(lambda x: len([word for word in str(x).split() if not does_exist(word, list_words)]))
df_test['text'] = df_test['text'].apply(text_processing)
df_test['word2vec'] = df_test['text'].apply(text_to_vector)

In [178]:
for i in range(100):
    df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
df_test.drop('word2vec', axis=1, inplace=True)

  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])
  df_test[f'wvect_{i}'] = df_test['word2vec'].apply(lambda x: x[i])


In [179]:
#encoding:
df_test['keyword'] = te_keyword.transform(df_test['keyword'])
df_test['country'] = te_country.transform(df_test['country'])

df_test[features_scaled] = scaler.transform(df_test[features_scaled])
df_test = pd.DataFrame(df_test, columns=key_columns)

  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)


In [156]:
#Submission catboost
y_pred_cat = cat_test.predict(df_test)
df_test['target'] = y_pred_cat
df_test['id'] = ids_test
df_test[['id', 'target']].to_csv('submission_catboost.csv', index=False)

  df_test['target'] = y_pred_cat
  df_test['id'] = ids_test


In [180]:
#Submission xgboost
df_test['target'] = xgb_test.predict(df_test)   
df_test['id'] = ids_test
df_test[['id', 'target']].to_csv('submission_xgboost.csv', index=False)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  df_test['target'] = xgb_test.predict(df_test)
  df_test['id'] = ids_test


In [142]:
'keyword' in df_test.columns

True

-------------------------------------------------------------------------------------

https://medium.com/swlh/nlp-all-them-features-every-feature-that-can-be-extracted-from-text-7032c0c87dee

In [34]:
example = "I am sad right now"
print(classifier(example)[0]['label'])

NEGATIVE


In [None]:
geolocator.geocode('London, UK').raw['display_name'].split(',')[-1]

'United States'

In [31]:
from transformers import pipeline
#we will use roberta-base-formality-ranker

formality = pipeline('text-classification', model="s-nlp/roberta-base-formality-ranker")
print(formality("to whom it may concern"))

[{'label': 'formal', 'score': 0.9851468205451965}]


In [37]:
nlp = pipeline('sentiment-analysis', model='cardiffnlp/twitter-roberta-base-offensive')
print(nlp("fuck you"))

[{'label': 'offensive', 'score': 0.8740859627723694}]
