In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import re
import string
import nltk
from nltk.stem.porter import *

In [2]:
entrenamiento_df = pd.read_csv('Archivos/entrenamiento_df.csv')
entrenamiento_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
test_df = pd.read_csv('Archivos/test_df.csv')
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [4]:
entrenamiento_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [6]:
#La columna location tiene demasiados nulos, mejor la elimino

entrenamiento_df = entrenamiento_df.drop('location', axis=1)
test_df = test_df.drop('location', axis=1)

display(entrenamiento_df)
display(test_df)

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
7608,10869,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,Police investigating after an e-bike collided ...,1


Unnamed: 0,id,keyword,text
0,0,,Just happened a terrible car crash
1,2,,"Heard about #earthquake is different cities, s..."
2,3,,"there is a forest fire at spot pond, geese are..."
3,9,,Apocalypse lighting. #Spokane #wildfires
4,11,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
3258,10861,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,Storm in RI worse than last hurricane. My city...
3260,10868,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [7]:
#Agrego features

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

entrenamiento_df['hashtags_count'] = entrenamiento_df['text'].apply(lambda x: x.count('#'))
test_df['hashtags_count'] = test_df['text'].apply(lambda x: x.count('#'))

entrenamiento_df['len_text'] = entrenamiento_df['text'].transform(lambda x: len(x))
test_df['len_text'] = test_df['text'].transform(lambda x: len(x))

entrenamiento_df['palabras_count'] = entrenamiento_df['text'].transform(lambda x: len(x.split()))
test_df['palabras_count'] = test_df['text'].transform(lambda x: len(x.split()))

entrenamiento_df['mentions_count'] = entrenamiento_df['text'].apply(lambda x: x.count('@'))
test_df['mentions_count'] = test_df['text'].apply(lambda x: x.count('@'))

entrenamiento_df['palabras_unicas_count'] = entrenamiento_df['text'].apply(lambda x: len(set(str(x).split())))
test_df['palabras_unicas_count'] = test_df['text'].apply(lambda x: len(set(str(x).split())))

entrenamiento_df['stopwords_count'] = entrenamiento_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
test_df['stopwords_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))

entrenamiento_df['url_count'] = entrenamiento_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
test_df['url_count'] = test_df['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))

entrenamiento_df['longitud_palabra_mean'] = entrenamiento_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df['longitud_palabra_mean'] = test_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

entrenamiento_df['punctuation_count'] = entrenamiento_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test_df['punctuation_count'] = test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

entrenamiento_df['caps_count'] = entrenamiento_df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))
test_df['caps_count'] = test_df['text'].apply(lambda x: sum(1 for c in str(x) if c.isupper()))

entrenamiento_df['digit_count'] = entrenamiento_df['text'].apply(lambda x: sum(1 for c in str(x) if c.isdigit()))
test_df['digit_count'] = test_df['text'].apply(lambda x: sum(1 for c in str(x) if c.isdigit()))

entrenamiento_df['palabras_unicas_ratio'] = entrenamiento_df['palabras_unicas_count'] / entrenamiento_df['palabras_count']
test_df['palabras_unicas_ratio'] = test_df['palabras_unicas_count'] / test_df['palabras_count']

entrenamiento_df['caps_ratio'] = entrenamiento_df['caps_count'] / entrenamiento_df['len_text']
test_df['caps_ratio'] = test_df['caps_count'] / test_df['len_text']

display(entrenamiento_df.head())
display(test_df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/elnic10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,keyword,text,target,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio
0,1,,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615,1,10,0,1.0,0.144928
1,4,,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429,1,5,0,1.0,0.131579
2,5,,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909,3,2,0,0.909091,0.015038
3,6,,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125,2,1,5,1.0,0.015385
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.5,2,3,0,0.9375,0.034091


Unnamed: 0,id,keyword,text,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio
0,0,,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333,0,1,0,1.0,0.029412
1,2,,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222,3,1,0,1.0,0.015625
2,3,,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263,2,1,0,1.0,0.010417
3,9,,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.25,3,2,0,1.0,0.05
4,11,,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.75,0,4,2,1.0,0.088889


In [8]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punc(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

In [9]:
slang_abbrev_dict = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'ATK': 'At The Keyboard',
    'ATM': 'At The Moment',
    'A3': 'Anytime, Anywhere, Anyplace',
    'BAK': 'Back At Keyboard',
    'BBL': 'Be Back Later',
    'BBS': 'Be Back Soon',
    'BFN': 'Bye For Now',
    'B4N': 'Bye For Now',
    'BRB': 'Be Right Back',
    'BRT': 'Be Right There',
    'BTW': 'By The Way',
    'B4': 'Before',
    'B4N': 'Bye For Now',
    'CU': 'See You',
    'CUL8R': 'See You Later',
    'CYA': 'See You',
    'FAQ': 'Frequently Asked Questions',
    'FC': 'Fingers Crossed',
    'FWIW': 'For What It\'s Worth',
    'FYI': 'For Your Information',
    'GAL': 'Get A Life',
    'GG': 'Good Game',
    'GN': 'Good Night',
    'GMTA': 'Great Minds Think Alike',
    'GR8': 'Great!',
    'G9': 'Genius',
    'IC': 'I See',
    'ICQ': 'I Seek you',
    'ILU': 'I Love You',
    'IMHO': 'In My Humble Opinion',
    'IMO': 'In My Opinion',
    'IOW': 'In Other Words',
    'IRL': 'In Real Life',
    'KISS': 'Keep It Simple, Stupid',
    'LDR': 'Long Distance Relationship',
    'LMAO': 'Laugh My Ass Off',
    'LOL': 'Laughing Out Loud',
    'LTNS': 'Long Time No See',
    'L8R': 'Later',
    'MTE': 'My Thoughts Exactly',
    'M8': 'Mate',
    'NRN': 'No Reply Necessary',
    'OIC': 'Oh I See',
    'OMG': 'Oh My God',
    'PITA': 'Pain In The Ass',
    'PRT': 'Party',
    'PRW': 'Parents Are Watching',
    'QPSA?': 'Que Pasa?',
    'ROFL': 'Rolling On The Floor Laughing',
    'ROFLOL': 'Rolling On The Floor Laughing Out Loud',
    'ROTFLMAO': 'Rolling On The Floor Laughing My Ass Off',
    'SK8': 'Skate',
    'STATS': 'Your sex and age',
    'ASL': 'Age, Sex, Location',
    'THX': 'Thank You',
    'TTFN': 'Ta-Ta For Now!',
    'TTYL': 'Talk To You Later',
    'U': 'You',
    'U2': 'You Too',
    'U4E': 'Yours For Ever',
    'WB': 'Welcome Back',
    'WTF': 'What The Fuck',
    'WTG': 'Way To Go!',
    'WUF': 'Where Are You From?',
    'W8': 'Wait',
    '7K': 'Sick:-D Laugher'
}

def unslang(text):
    if text.upper() in slang_abbrev_dict.keys():
        return slang_abbrev_dict[text.upper()]
    else:
        return text

In [10]:
def tokenization(text):
    text = re.split('\W+', text)
    return text

stemmer = PorterStemmer()

def remove_stopwords(text):
    text = [word for word in text if word not in stopwords]
    return text

def stemming(text):
    text = [stemmer.stem(word) for word in text]
    return text

In [11]:
for datas in [entrenamiento_df, test_df]:
    datas['cleaned_text'] = datas['text'].apply(lambda x : remove_url(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_html(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_emoji(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : unslang(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_punc(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : tokenization(x.lower()))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : remove_stopwords(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : stemming(x))
    datas['cleaned_text'] = datas['cleaned_text'].apply(lambda x : ' '.join(x))

entrenamiento_df

Unnamed: 0,id,keyword,text,target,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
0,1,,Our Deeds are the Reason of this #earthquake M...,1,1,69,13,0,13,6,0,4.384615,1,10,0,1.000000,0.144928,deed reason earthquak may allah forgiv us
1,4,,Forest fire near La Ronge Sask. Canada,1,0,38,7,0,7,0,0,4.571429,1,5,0,1.000000,0.131579,forest fire near la rong sask canada
2,5,,All residents asked to 'shelter in place' are ...,1,0,133,22,0,20,11,0,5.090909,3,2,0,0.909091,0.015038,resid ask shelter place notifi offic evacu she...
3,6,,"13,000 people receive #wildfires evacuation or...",1,1,65,8,0,8,1,0,7.125000,2,1,5,1.000000,0.015385,13000 peopl receiv wildfir evacu order califor...
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,2,88,16,0,15,7,0,4.500000,2,3,0,0.937500,0.034091,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,,Two giant cranes holding a bridge collapse int...,1,0,83,11,0,11,2,1,6.636364,5,7,1,1.000000,0.084337,two giant crane hold bridg collaps nearbi home
7609,10870,,@aria_ahrary @TheTawniest The out of control w...,1,0,125,20,2,17,9,0,5.300000,5,6,0,0.850000,0.048000,ariaahrari thetawniest control wild fire calif...
7610,10871,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,0,65,8,0,8,2,1,7.250000,11,10,9,1.000000,0.153846,m194 0104 utc5km volcano hawaii
7611,10872,,Police investigating after an e-bike collided ...,1,0,137,19,0,19,5,0,6.263158,5,4,0,1.000000,0.029197,polic investig ebik collid car littl portug eb...


In [12]:
test_df

Unnamed: 0,id,keyword,text,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
0,0,,Just happened a terrible car crash,0,34,6,0,6,2,0,4.833333,0,1,0,1.000000,0.029412,happen terribl car crash
1,2,,"Heard about #earthquake is different cities, s...",1,64,9,0,9,2,0,6.222222,3,1,0,1.000000,0.015625,heard earthquak differ citi stay safe everyon
2,3,,"there is a forest fire at spot pond, geese are...",0,96,19,0,19,9,0,4.105263,2,1,0,1.000000,0.010417,forest fire spot pond gees flee across street ...
3,9,,Apocalypse lighting. #Spokane #wildfires,2,40,4,0,4,0,0,9.250000,3,2,0,1.000000,0.050000,apocalyps light spokan wildfir
4,11,,Typhoon Soudelor kills 28 in China and Taiwan,0,45,8,0,8,2,0,4.750000,0,4,2,1.000000,0.088889,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0,55,8,0,7,0,0,6.000000,0,45,0,0.875000,0.818182,earthquak safeti lo angel ûò safeti fasten xrwn
3259,10865,,Storm in RI worse than last hurricane. My city...,0,139,23,0,22,6,0,5.086957,5,7,6,0.956522,0.050360,storm ri wors last hurrican cityamp3oth hardes...
3260,10868,,Green Line derailment in Chicago http://t.co/U...,0,55,6,0,6,1,1,8.333333,5,9,0,1.000000,0.163636,green line derail chicago
3261,10874,,MEG issues Hazardous Weather Outlook (HWO) htt...,0,65,7,0,7,0,1,8.428571,7,15,3,1.000000,0.230769,meg issu hazard weather outlook hwo


In [13]:
X = entrenamiento_df.drop(['id', 'text', 'target'], axis = 1)
y = entrenamiento_df['target']
display(X)
display(y)

Unnamed: 0,keyword,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
0,,1,69,13,0,13,6,0,4.384615,1,10,0,1.000000,0.144928,deed reason earthquak may allah forgiv us
1,,0,38,7,0,7,0,0,4.571429,1,5,0,1.000000,0.131579,forest fire near la rong sask canada
2,,0,133,22,0,20,11,0,5.090909,3,2,0,0.909091,0.015038,resid ask shelter place notifi offic evacu she...
3,,1,65,8,0,8,1,0,7.125000,2,1,5,1.000000,0.015385,13000 peopl receiv wildfir evacu order califor...
4,,2,88,16,0,15,7,0,4.500000,2,3,0,0.937500,0.034091,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,,0,83,11,0,11,2,1,6.636364,5,7,1,1.000000,0.084337,two giant crane hold bridg collaps nearbi home
7609,,0,125,20,2,17,9,0,5.300000,5,6,0,0.850000,0.048000,ariaahrari thetawniest control wild fire calif...
7610,,0,65,8,0,8,2,1,7.250000,11,10,9,1.000000,0.153846,m194 0104 utc5km volcano hawaii
7611,,0,137,19,0,19,5,0,6.263158,5,4,0,1.000000,0.029197,polic investig ebik collid car littl portug eb...


0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [14]:
test_df = test_df.drop(['id', 'text'], axis = 1)
test_df

Unnamed: 0,keyword,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
0,,0,34,6,0,6,2,0,4.833333,0,1,0,1.000000,0.029412,happen terribl car crash
1,,1,64,9,0,9,2,0,6.222222,3,1,0,1.000000,0.015625,heard earthquak differ citi stay safe everyon
2,,0,96,19,0,19,9,0,4.105263,2,1,0,1.000000,0.010417,forest fire spot pond gees flee across street ...
3,,2,40,4,0,4,0,0,9.250000,3,2,0,1.000000,0.050000,apocalyps light spokan wildfir
4,,0,45,8,0,8,2,0,4.750000,0,4,2,1.000000,0.088889,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,,0,55,8,0,7,0,0,6.000000,0,45,0,0.875000,0.818182,earthquak safeti lo angel ûò safeti fasten xrwn
3259,,0,139,23,0,22,6,0,5.086957,5,7,6,0.956522,0.050360,storm ri wors last hurrican cityamp3oth hardes...
3260,,0,55,6,0,6,1,1,8.333333,5,9,0,1.000000,0.163636,green line derail chicago
3261,,0,65,7,0,7,0,1,8.428571,7,15,3,1.000000,0.230769,meg issu hazard weather outlook hwo


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
#Reemplazo valores nulos de keyword por el mas frecuente

imp = SimpleImputer(strategy="most_frequent")
X_train['keyword'] = imp.fit_transform(X_train[['keyword']])
X_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,keyword,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
5151,obliterate,0,136,22,2,21,7,0,5.227273,4,4,0,0.954545,0.029412,dicehatem puppyshogun make sens paper beat roc...
6351,structural failure,0,128,17,1,17,8,1,6.588235,10,7,2,1.000000,0.054688,catoinstitut caus feder failur deepli structur...
3443,exploded,0,137,29,0,29,13,0,3.758621,2,4,0,1.000000,0.029197,well chane ipad screen fuck explod glass went ...
7164,war zone,0,53,12,0,11,6,0,3.500000,3,5,0,0.916667,0.094340,war drug turn us war zone
7037,typhoon,0,53,6,0,6,1,0,8.000000,1,6,0,1.000000,0.113208,obama declar disast typhoondevast saipan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,obliteration,0,96,16,1,16,8,0,5.062500,3,5,4,1.000000,0.052083,eganator2000 arent mani obliter server alway l...
5390,panic,0,132,28,0,27,14,0,3.750000,1,2,0,0.964286,0.015152,panic attack bc dont enough money drug alcohol...
860,blood,0,121,13,0,13,1,2,8.384615,11,41,6,1.000000,0.338843,omron hem712c automat blood pressur monitor st...
7603,siren,0,136,20,0,19,8,1,5.850000,8,7,2,0.950000,0.051471,offici say quarantin place alabama home possib...


In [17]:
X_test['keyword'] = imp.transform(X_test[['keyword']])
X_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,keyword,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
2644,destruction,0,66,11,0,11,6,0,5.090909,2,1,0,1.000000,0.015152,new weapon caus unimagin destruct
2227,deluge,1,119,21,4,20,9,0,4.714286,10,12,0,0.952381,0.100840,famp thing gishwh got soak delug go pad tampon...
5448,police,0,125,15,2,15,3,1,7.400000,12,17,3,1.000000,0.136000,dt georgegalloway rt galloway4mayor ûïthe col ...
132,aftershock,0,114,21,0,20,9,0,4.476190,3,3,0,0.952381,0.026316,aftershock back school kick great want thank e...
6845,trauma,0,105,17,0,17,5,0,5.235294,3,2,1,1.000000,0.019048,respons trauma children addict develop defens ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5209,obliterated,0,104,19,1,19,8,0,4.526316,2,6,0,1.000000,0.057692,glad got obliter x men first class fulli deser...
387,arson,0,105,13,0,13,3,1,7.153846,5,10,1,1.000000,0.095238,mourn notic stab arson victim stir û polit gri...
4848,mass murderer,0,81,11,0,11,2,1,6.454545,6,13,0,1.000000,0.160494,mass murder che guevara greet woman north korea
1032,body bags,0,126,14,0,14,0,2,8.071429,10,17,4,1.000000,0.134921,women flower print shoulder handbag cross bodi...


In [18]:
test_df['keyword'] = imp.transform(test_df[['keyword']])
test_df

Unnamed: 0,keyword,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,digit_count,palabras_unicas_ratio,caps_ratio,cleaned_text
0,siren,0,34,6,0,6,2,0,4.833333,0,1,0,1.000000,0.029412,happen terribl car crash
1,siren,1,64,9,0,9,2,0,6.222222,3,1,0,1.000000,0.015625,heard earthquak differ citi stay safe everyon
2,siren,0,96,19,0,19,9,0,4.105263,2,1,0,1.000000,0.010417,forest fire spot pond gees flee across street ...
3,siren,2,40,4,0,4,0,0,9.250000,3,2,0,1.000000,0.050000,apocalyps light spokan wildfir
4,siren,0,45,8,0,8,2,0,4.750000,0,4,2,1.000000,0.088889,typhoon soudelor kill 28 china taiwan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,siren,0,55,8,0,7,0,0,6.000000,0,45,0,0.875000,0.818182,earthquak safeti lo angel ûò safeti fasten xrwn
3259,siren,0,139,23,0,22,6,0,5.086957,5,7,6,0.956522,0.050360,storm ri wors last hurrican cityamp3oth hardes...
3260,siren,0,55,6,0,6,1,1,8.333333,5,9,0,1.000000,0.163636,green line derail chicago
3261,siren,0,65,7,0,7,0,1,8.428571,7,15,3,1.000000,0.230769,meg issu hazard weather outlook hwo


In [19]:
#One hot encoding para keyword

encoder = OneHotEncoder(sparse = False)
transformed = encoder.fit_transform(X_train[['keyword']].values)
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
X_train = pd.concat([X_train.reset_index(drop=True), onehot_df], axis = 1).drop(['keyword'], axis = 1)
X_train

Unnamed: 0,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,0,136,22,2,21,7,0,5.227273,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,128,17,1,17,8,1,6.588235,10,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,137,29,0,29,13,0,3.758621,2,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,53,12,0,11,6,0,3.500000,3,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,53,6,0,6,1,0,8.000000,1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,0,96,16,1,16,8,0,5.062500,3,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5705,0,132,28,0,27,14,0,3.750000,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5706,0,121,13,0,13,1,2,8.384615,11,41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5707,0,136,20,0,19,8,1,5.850000,8,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
transformed = encoder.transform(X_test[['keyword']])
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
X_test = pd.concat([X_test.reset_index(drop=True), onehot_df], axis = 1).drop(['keyword'], axis = 1)
X_test

Unnamed: 0,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,0,66,11,0,11,6,0,5.090909,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,119,21,4,20,9,0,4.714286,10,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,125,15,2,15,3,1,7.400000,12,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,114,21,0,20,9,0,4.476190,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,105,17,0,17,5,0,5.235294,3,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,0,104,19,1,19,8,0,4.526316,2,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0,105,13,0,13,3,1,7.153846,5,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1901,0,81,11,0,11,2,1,6.454545,6,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1902,0,126,14,0,14,0,2,8.071429,10,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
transformed = encoder.transform(test_df[['keyword']])
onehot_df = pd.DataFrame(transformed, columns = encoder.get_feature_names())
test_df = pd.concat([test_df, onehot_df], axis = 1).drop(['keyword'], axis = 1)
test_df

Unnamed: 0,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,...,x0_weapons,x0_whirlwind,x0_wild fires,x0_wildfire,x0_windstorm,x0_wounded,x0_wounds,x0_wreck,x0_wreckage,x0_wrecked
0,0,34,6,0,6,2,0,4.833333,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,64,9,0,9,2,0,6.222222,3,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,96,19,0,19,9,0,4.105263,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,40,4,0,4,0,0,9.250000,3,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,45,8,0,8,2,0,4.750000,0,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,55,8,0,7,0,0,6.000000,0,45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,0,139,23,0,22,6,0,5.086957,5,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,0,55,6,0,6,1,1,8.333333,5,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,0,65,7,0,7,0,1,8.428571,7,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#TF IDF

tfidfvectorizer = TfidfVectorizer()
transformed = tfidfvectorizer.fit_transform(X_train['cleaned_text']).toarray()
tfidf_df = pd.DataFrame(transformed, columns = tfidfvectorizer.get_feature_names())
X_train = pd.concat([X_train, tfidf_df], axis = 1).drop(['cleaned_text'], axis = 1)
X_train

Unnamed: 0,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,...,ûïyou,ûò,ûò800000,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókodi,ûótech,ûówe
0,0,136,22,2,21,7,0,5.227273,4,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,128,17,1,17,8,1,6.588235,10,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,137,29,0,29,13,0,3.758621,2,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,53,12,0,11,6,0,3.500000,3,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,53,6,0,6,1,0,8.000000,1,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,0,96,16,1,16,8,0,5.062500,3,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5705,0,132,28,0,27,14,0,3.750000,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5706,0,121,13,0,13,1,2,8.384615,11,41,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5707,0,136,20,0,19,8,1,5.850000,8,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
transformed = tfidfvectorizer.transform(X_test['cleaned_text']).toarray()
tfidf_df = pd.DataFrame(transformed, columns = tfidfvectorizer.get_feature_names())
X_test = pd.concat([X_test, tfidf_df], axis = 1).drop(['cleaned_text'], axis = 1)
X_test

Unnamed: 0,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,...,ûïyou,ûò,ûò800000,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókodi,ûótech,ûówe
0,0,66,11,0,11,6,0,5.090909,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,119,21,4,20,9,0,4.714286,10,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,125,15,2,15,3,1,7.400000,12,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,114,21,0,20,9,0,4.476190,3,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,105,17,0,17,5,0,5.235294,3,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,0,104,19,1,19,8,0,4.526316,2,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0,105,13,0,13,3,1,7.153846,5,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1901,0,81,11,0,11,2,1,6.454545,6,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1902,0,126,14,0,14,0,2,8.071429,10,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
transformed = tfidfvectorizer.transform(test_df['cleaned_text']).toarray()
tfidf_df = pd.DataFrame(transformed, columns = tfidfvectorizer.get_feature_names())
test_df = pd.concat([test_df, tfidf_df], axis = 1).drop(['cleaned_text'], axis = 1)
test_df

Unnamed: 0,hashtags_count,len_text,palabras_count,mentions_count,palabras_unicas_count,stopwords_count,url_count,longitud_palabra_mean,punctuation_count,caps_count,...,ûïyou,ûò,ûò800000,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókodi,ûótech,ûówe
0,0,34,6,0,6,2,0,4.833333,0,1,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,64,9,0,9,2,0,6.222222,3,1,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,96,19,0,19,9,0,4.105263,2,1,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,40,4,0,4,0,0,9.250000,3,2,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,45,8,0,8,2,0,4.750000,0,4,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,55,8,0,7,0,0,6.000000,0,45,...,0.0,0.315936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,0,139,23,0,22,6,0,5.086957,5,7,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,0,55,6,0,6,1,1,8.333333,5,9,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,0,65,7,0,7,0,1,8.428571,7,15,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
#GridSearch with Valid set

penalties = ['l1','l2']
c = [0.0001,0.001,0.01,0.1,1,10]

for penalty in penalties:
    for value in c:
        log = LogisticRegression(penalty=penalty, C=value, random_state=7, max_iter=1000, solver='liblinear')
        log.fit(X_train, y_train)
        print('penalty: {}, C: {}'.format(penalty,value))
        print('Train Score', log.score(X_train, y_train))
        print('Test Score', log.score(X_test, y_test))
        print ('Log Regression Training f-1 score: %.4f' % f1_score(y_train, log.predict(X_train)))
        print ('Log Regression Test f-1 score: %.4f' % f1_score(y_test, log.predict(X_test)))

penalty: l1, C: 0.0001
Train Score 0.5694517428621475
Test Score 0.5730042016806722
Log Regression Training f-1 score: 0.0000
Log Regression Test f-1 score: 0.0000
penalty: l1, C: 0.001
Train Score 0.5796111403047819
Test Score 0.5756302521008403
Log Regression Training f-1 score: 0.3392
Log Regression Test f-1 score: 0.3311
penalty: l1, C: 0.01
Train Score 0.6416184971098265
Test Score 0.6580882352941176
Log Regression Training f-1 score: 0.5375
Log Regression Test f-1 score: 0.5444
penalty: l1, C: 0.1
Train Score 0.6673673147661587
Test Score 0.6796218487394958
Log Regression Training f-1 score: 0.5751
Log Regression Test f-1 score: 0.5810
penalty: l1, C: 1
Train Score 0.8187073042564372
Test Score 0.7951680672268907
Log Regression Training f-1 score: 0.7731
Log Regression Test f-1 score: 0.7468
penalty: l1, C: 10
Train Score 0.9877386582588895
Test Score 0.770483193277311
Log Regression Training f-1 score: 0.9857
Log Regression Test f-1 score: 0.7270
penalty: l2, C: 0.0001
Train Sco

In [26]:
penalties = ['l2']

c = [i/10 for i in range(9,16)]

for penalty in penalties:
    for value in c:
        log = LogisticRegression(penalty=penalty, C=value, random_state=7, max_iter=1000, solver='liblinear')
        log.fit(X_train, y_train)
        print('penalty: {}, C: {}'.format(penalty,value))
        print('Train Score', log.score(X_train, y_train))
        print('Test Score', log.score(X_test, y_test))
        print ('Log Regression Training f-1 score: %.4f' % f1_score(y_train, log.predict(X_train)))
        print ('Log Regression Test f-1 score: %.4f' % f1_score(y_test, log.predict(X_test)))

penalty: l2, C: 0.9
Train Score 0.8721317218427045
Test Score 0.8082983193277311
Log Regression Training f-1 score: 0.8423
Log Regression Test f-1 score: 0.7653
penalty: l2, C: 1.0
Train Score 0.8758101243650377
Test Score 0.8103991596638656
Log Regression Training f-1 score: 0.8470
Log Regression Test f-1 score: 0.7678
penalty: l2, C: 1.1
Train Score 0.8791382028376248
Test Score 0.8114495798319328
Log Regression Training f-1 score: 0.8510
Log Regression Test f-1 score: 0.7691
penalty: l2, C: 1.2
Train Score 0.883166929409704
Test Score 0.8125
Log Regression Training f-1 score: 0.8562
Log Regression Test f-1 score: 0.7707
penalty: l2, C: 1.3
Train Score 0.8875459800315292
Test Score 0.8114495798319328
Log Regression Training f-1 score: 0.8615
Log Regression Test f-1 score: 0.7703
penalty: l2, C: 1.4
Train Score 0.8901734104046243
Test Score 0.8114495798319328
Log Regression Training f-1 score: 0.8648
Log Regression Test f-1 score: 0.7712
penalty: l2, C: 1.5
Train Score 0.8929760028025

In [27]:
log = LogisticRegression(C=1.2, penalty='l2', random_state=7, max_iter=1000, solver='liblinear')
log.fit(X_train, y_train)

LogisticRegression(C=1.2, max_iter=1000, random_state=7, solver='liblinear')

In [28]:
y_test

2644    1
2227    0
5448    1
132     0
6845    0
       ..
5209    0
387     1
4848    1
1032    0
7195    1
Name: target, Length: 1904, dtype: int64

In [29]:
predicts = log.predict(test_df)

In [30]:
submit = pd.read_csv('setDeDatos/nlp-getting-started/sample_submission.csv')
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [31]:
submit['target'].value_counts()

0    3263
Name: target, dtype: int64

In [32]:
submit['target'] = pd.DataFrame({'target':predicts})
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [33]:
submit['target'].value_counts()

0    2006
1    1257
Name: target, dtype: int64

In [34]:
submit.to_csv('Submits/submit_tfidf_logReg.csv', index=False)