# Version with stop words removed and using tf-idf

In [1]:
!pip install emot

Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emot
Successfully installed emot-3.1
[0m

In [2]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
from emot import EMOTICONS_EMO

In [3]:
df = pd.read_csv("../input/crowdflower-weather-twitter/train.csv")
test_tweets = pd.read_csv("../input/crowdflower-weather-twitter/test.csv")['tweet']
tweets = df['tweet']
targets = df[['s1','s2','s3','s4','s5','w1','w2','w3','w4','k1','k2','k4','k5','k6','k7','k8','k9','k10','k11','k12','k13','k14','k15']]

In [4]:
# s_label = df[['s1','s2','s3','s4','s5']]
# w_label = df[['w1','w2','w3','w4']]
# k_label = df[['k1','k2','k4','k5','k6','k7','k8','k9','k10','k11','k12','k13','k14','k15']]

In [5]:
def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        if(emot in text):
            text = text.replace(emot,"_".join(EMOTICONS_EMO[emot].split()))
    return text

def remove_stopwords(tweets,stopwords):
    tweets_lists = []
    for text in tweets:

        lst=[]
        for token in text.split():
            if token.lower() not in stopwords:    #checking whether the word is not 
                lst.append(token)                    #present in the stopword list.
            
        tweets_lists.append(' '.join(lst))
            
    return tweets_lists

In [6]:
nlp = spacy.load('en_core_web_sm')
stopwords = nlp.Defaults.stop_words

In [7]:
# processed_tweets = tweets
processed_tweets  = [convert_emoticons(tweet) for tweet in tweets]
processed_test_tweets = [convert_emoticons(tweet) for tweet in test_tweets]
# processed_tweets = remove_stopwords(processed_tweets,stopwords)
# processed_tweets = [tweets.lower().replace(".","") for tweets in processed_tweets]

In [8]:
# from sklearn.feature_extraction.text import CountVectorizer

# count_vect = CountVectorizer(ngram_range=(1,3))
# #Build a BOW representation for the corpus
# bow_rep = count_vect.fit_transform(processed_tweets)

# #Look at the vocabulary mapping
# print("Our vocabulary: ", count_vect.vocabulary_)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(processed_tweets)
test_data = tfidf.transform(processed_test_tweets)

# #IDF for all words in the vocabulary
# print("IDF for all words in the vocabulary",tfidf.idf_)
# print("-"*10)
# #All words in the vocabulary.
# print("All words in the vocabulary",tfidf.get_feature_names())
# print("-"*10)

#TFIDF representation for all documents in our corpus 
# print("TFIDF representation for all documents in our corpus\n",sum(bow_rep_tfidf.toarray()[0])) 
# print("-"*10)

# temp = tfidf.transform(["dog and man are friends"])
# print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())

# Train_test_split

## Model for first s

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, targets, test_size=0.3, random_state=42)
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

# X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
# print("Shape of X_valid: ", X_valid.shape)
# print("Shape of y_valid: ", y_valid.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)
# from sklearn.linear_model import Ridge,LinearRegression,SGDRegressor
# clf = SGDRegressor()                                    
# clf.fit(X_train, y_train)
# clf.score(X_test,y_test)
print("Shape of test data: ", test_data.shape)

Shape of X_train:  (54562, 42828)
Shape of y_train:  (54562, 23)
Shape of X_test:  (23384, 42828)
Shape of y_test:  (23384, 23)
Shape of test data:  (42157, 42828)


In [11]:
# from sklearn.model_selection import GridSearchCV
# parameters = {'alpha':[0.2,0.4,0.6,0.8,1.0]}
# ridge = Ridge()
# clf = GridSearchCV(ridge, parameters)
# clf.fit(train_tfidf, s_label.iloc[:,0])
# print("Best Score: ", clf.best_score_)
# print("Best Param: ", clf.best_params_)

In [12]:

# models = []
# for i in range(len(s_label.columns)):
#     parameters = {'alpha':[0.2,0.4,0.6,0.8,1.0]}
#     ridge = Ridge()
#     clf = GridSearchCV(ridge, parameters)
#     clf.fit(X_train, y_train.iloc[:,i]) 
#     models.append(clf)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

def get_ridge_best_parameters(X_train,y_train):
    best_params = []
    for i in range(len(y_train.columns)):
        parameters = {'alpha':[0.2,0.4,0.6,0.8,1.0]}
        model = Ridge()
        clf = GridSearchCV(model, parameters)
        clf.fit(X_train, y_train.iloc[:,i]) 
        best_params.append(clf.best_params_['alpha']) 
    return best_params

# def get_rf_best_parameters(X_train,y_train):
#     best_params = []
#     for i in range(len(y_train.columns)):
#         parameters = {'n_estimators':[20]}
#         model = RandomForestRegressor(random_state=42)
#         clf = GridSearchCV(model, parameters)
#         clf.fit(X_train, y_train.iloc[:,i]) 
#         best_params.append(clf.best_params_['n_estimators']) 
#     return best_params
                           
# def get_gbr_best_parameters(X_train,y_train):
#     best_params = []
#     for i in range(len(y_train.columns)):
#         parameters = {'n_estimators':[20,40,60,80,100]}
#         model = GradientBoostingRegressor(random_state=42)
#         clf = GridSearchCV(model, parameters)
#         clf.fit(X_train, y_train.iloc[:,i]) 
#         best_params.append(clf.best_params_['n_estimators']) 
#     return best_params

In [14]:
# ridge_best_params = get_ridge_best_parameters(X_train,y_train)
# print(ridge_best_params)
# rf_best_params = get_rf_best_parameters(X_train,y_train)
# print(rf_best_params)
# gbr_best_params = get_gbr_best_parameters(X_train,y_train)
# # print(ridge_best_params)
# # print(rf_best_params)
# print(gbr_best_params)

In [15]:
best_params = [1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.8,1.0]

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
columns=['s1','s2','s3','s4','s5','w1','w2','w3','w4','k1','k2','k4','k5','k6','k7','k8','k9','k10','k11','k12','k13','k14','k15']
predictions = {}
models = []
for i in range(len(y_train.columns)):
    estimators = [
        ('rg', Ridge(best_params[i])),
        ('rf', RandomForestRegressor(n_estimators=10,random_state=42)),
        ('gbr', GradientBoostingRegressor(n_estimators=10,random_state=42))
    ]
    reg = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression()
    )
    
    reg.fit(X_train,y_train.iloc[:,i])
    models.append(reg)
    
    ## Print each model training mse
    y_pred = reg.predict(X_test)
    print(mean_squared_error(y_test.iloc[:,i],y_pred))
    
    ## store predictions of test data for submissions
    predicted_value = reg.predict(test_data)
    predictions[columns[i]] = predicted_value
    

0.01192350862392939
0.05364331541393234
0.04402065155302788
0.0472105939922894
0.04057902870206601
0.07391511377997395
0.02925192606711944
0.04972031827804435
0.021660852069904427
0.0038103322410947714
0.015574279283302586
0.018620127403494443
0.0037144161071696
0.0003843431866702781
0.04251606280261587
0.0011085671743590688
0.019206501702310568
0.00947908716901282
0.004482413437975885
0.01088826412152463
0.01568486198731768
0.0017167838171877226
0.004964892675628112


In [17]:
# array = [[1,2,3],[5,2,3],[5,2,8]]
# columns=['s1','s2','s3','s4','s5','w1','w2','w3','w4','k1','k2','k4','k5','k6','k7','k8','k9','k10','k11','k12','k13','k14','k15']
# predictions = {}
# for index,arr in enumerate(array):
#     predictions[columns[index]] = arr

In [18]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.ensemble import StackingRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# columns=['s1','s2','s3','s4','s5','w1','w2','w3','w4','k1','k2','k4','k5','k6','k7','k8','k9','k10','k11','k12','k13','k14','k15']
# predictions = {}
# estimators = [
#     ('rg', Ridge(best_params[0])),
#     ('rf', RandomForestRegressor(n_estimators=10,random_state=42)),
#     ('gbr', GradientBoostingRegressor(n_estimators=10,random_state=42))
# ]
# reg = StackingRegressor(
#     estimators=estimators,
#     final_estimator=LinearRegression()
# )

# reg.fit(X_train,y_train.iloc[:,0])
# y_pred = reg.predict(X_test)
# predictions[columns[i]] = y_pred
# print(mean_squared_error(y_test.iloc[:,0],y_pred))

In [19]:
# y_pred = reg.predict(X_test)
# # predictions[columns[i]] = y_pred
# print(mean_squared_error(y_test.iloc[:,0],y_pred))

In [20]:
# predictions = [[1,1,1],[2,2,2],[-1,-2,-1]]
df = pd.DataFrame.from_dict(predictions)
df

Unnamed: 0,s1,s2,s3,s4,s5,w1,w2,w3,w4,k1,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,0.164297,0.221659,0.163955,0.158044,0.071763,0.887030,-0.013922,0.127066,0.018735,0.014001,...,-0.000911,0.092032,0.000577,0.059154,-0.004447,-0.003316,-0.009204,0.930198,-0.000729,0.018542
1,0.049518,0.023245,0.563398,0.058521,0.154236,0.527623,0.233937,0.148733,0.101812,0.016389,...,-0.000107,0.070899,0.001486,0.018293,0.008256,0.002893,0.757591,0.024506,0.002577,0.004537
2,0.079898,0.199310,0.313762,0.002850,0.227768,0.537094,0.111735,0.191429,0.119337,0.006224,...,0.000805,0.777443,-0.003624,0.083231,0.001853,0.018645,0.015625,0.022737,0.003461,-0.001053
3,0.004336,-0.010348,1.000478,-0.009143,0.003119,0.896838,0.036147,0.047202,0.019000,0.000774,...,0.000053,0.175934,0.001159,0.186755,0.004714,-0.000208,0.003818,0.037833,-0.000016,0.644209
4,0.028528,0.212469,0.293573,0.071540,0.466527,0.418121,0.210159,0.246056,0.179521,0.086977,...,-0.000066,0.699127,0.000372,0.052103,0.197708,0.006032,0.012906,0.024174,-0.001608,0.001760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42152,0.071693,0.508905,0.084703,0.025965,0.372710,0.833927,0.068629,0.091443,0.009546,-0.006753,...,-0.000040,0.870406,0.001030,0.089093,0.008106,-0.000356,-0.002190,0.008025,0.002432,0.008168
42153,0.034822,0.690430,0.006217,0.038543,0.268835,0.667907,0.031100,0.259277,0.024505,0.003974,...,0.000205,0.023925,0.002569,0.002098,0.002195,0.898257,0.001756,0.006029,0.001105,0.001279
42154,0.011821,0.173590,0.235973,0.453418,0.316387,0.629096,0.202146,0.118389,0.130327,0.008404,...,0.003580,0.701994,0.005151,0.134850,0.012586,0.006784,0.029000,0.007513,0.009686,-0.005308
42155,0.044806,0.021749,0.650109,0.050807,0.101291,0.569037,0.095018,0.095002,0.228248,-0.000264,...,0.001152,0.151333,-0.001437,0.107706,0.005842,-0.001705,0.101141,0.123016,0.891383,-0.003161


In [21]:
## Make the range between [0,1]
df[df > 1] = 1
df[df < 0] = 0
df

Unnamed: 0,s1,s2,s3,s4,s5,w1,w2,w3,w4,k1,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,0.164297,0.221659,0.163955,0.158044,0.071763,0.887030,0.000000,0.127066,0.018735,0.014001,...,0.000000,0.092032,0.000577,0.059154,0.000000,0.000000,0.000000,0.930198,0.000000,0.018542
1,0.049518,0.023245,0.563398,0.058521,0.154236,0.527623,0.233937,0.148733,0.101812,0.016389,...,0.000000,0.070899,0.001486,0.018293,0.008256,0.002893,0.757591,0.024506,0.002577,0.004537
2,0.079898,0.199310,0.313762,0.002850,0.227768,0.537094,0.111735,0.191429,0.119337,0.006224,...,0.000805,0.777443,0.000000,0.083231,0.001853,0.018645,0.015625,0.022737,0.003461,0.000000
3,0.004336,0.000000,1.000000,0.000000,0.003119,0.896838,0.036147,0.047202,0.019000,0.000774,...,0.000053,0.175934,0.001159,0.186755,0.004714,0.000000,0.003818,0.037833,0.000000,0.644209
4,0.028528,0.212469,0.293573,0.071540,0.466527,0.418121,0.210159,0.246056,0.179521,0.086977,...,0.000000,0.699127,0.000372,0.052103,0.197708,0.006032,0.012906,0.024174,0.000000,0.001760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42152,0.071693,0.508905,0.084703,0.025965,0.372710,0.833927,0.068629,0.091443,0.009546,0.000000,...,0.000000,0.870406,0.001030,0.089093,0.008106,0.000000,0.000000,0.008025,0.002432,0.008168
42153,0.034822,0.690430,0.006217,0.038543,0.268835,0.667907,0.031100,0.259277,0.024505,0.003974,...,0.000205,0.023925,0.002569,0.002098,0.002195,0.898257,0.001756,0.006029,0.001105,0.001279
42154,0.011821,0.173590,0.235973,0.453418,0.316387,0.629096,0.202146,0.118389,0.130327,0.008404,...,0.003580,0.701994,0.005151,0.134850,0.012586,0.006784,0.029000,0.007513,0.009686,0.000000
42155,0.044806,0.021749,0.650109,0.050807,0.101291,0.569037,0.095018,0.095002,0.228248,0.000000,...,0.001152,0.151333,0.000000,0.107706,0.005842,0.000000,0.101141,0.123016,0.891383,0.000000


In [22]:
from sklearn.preprocessing import normalize
s_pred = normalize(df.iloc[:,0:5], norm='l1',axis=1, copy=True, return_norm=False)
w_pred = normalize(df.iloc[:,5:9], norm='l1',axis=1, copy=True, return_norm=False)


In [23]:
df1 = pd.DataFrame(s_pred,columns=['s1','s2','s3','s4','s5'])
df2 = pd.DataFrame(w_pred,columns=['w1','w2','w3','w4'])
prediction_df = pd.concat([df1,df2,df.iloc[:,9:]],axis=1)
prediction_df

Unnamed: 0,s1,s2,s3,s4,s5,w1,w2,w3,w4,k1,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,0.210714,0.284281,0.210275,0.202693,0.092037,0.858833,0.000000,0.123027,0.018140,0.014001,...,0.000000,0.092032,0.000577,0.059154,0.000000,0.000000,0.000000,0.930198,0.000000,0.018542
1,0.058331,0.027382,0.663666,0.068936,0.181685,0.521312,0.231139,0.146954,0.100595,0.016389,...,0.000000,0.070899,0.001486,0.018293,0.008256,0.002893,0.757591,0.024506,0.002577,0.004537
2,0.097012,0.242002,0.380970,0.003460,0.276556,0.559709,0.116440,0.199489,0.124362,0.006224,...,0.000805,0.777443,0.000000,0.083231,0.001853,0.018645,0.015625,0.022737,0.003461,0.000000
3,0.004304,0.000000,0.992601,0.000000,0.003096,0.897569,0.036176,0.047240,0.019015,0.000774,...,0.000053,0.175934,0.001159,0.186755,0.004714,0.000000,0.003818,0.037833,0.000000,0.644209
4,0.026596,0.198081,0.273693,0.066696,0.434935,0.396753,0.199419,0.233482,0.170346,0.086977,...,0.000000,0.699127,0.000372,0.052103,0.197708,0.006032,0.012906,0.024174,0.000000,0.001760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42152,0.067382,0.478305,0.079610,0.024404,0.350299,0.830981,0.068387,0.091120,0.009512,0.000000,...,0.000000,0.870406,0.001030,0.089093,0.008106,0.000000,0.000000,0.008025,0.002432,0.008168
42153,0.033520,0.664612,0.005984,0.037102,0.258782,0.679603,0.031645,0.263818,0.024934,0.003974,...,0.000205,0.023925,0.002569,0.002098,0.002195,0.898257,0.001756,0.006029,0.001105,0.001279
42154,0.009924,0.145728,0.198099,0.380643,0.265606,0.582519,0.187179,0.109624,0.120678,0.008404,...,0.003580,0.701994,0.005151,0.134850,0.012586,0.006784,0.029000,0.007513,0.009686,0.000000
42155,0.051575,0.025034,0.748317,0.058482,0.116593,0.576354,0.096240,0.096223,0.231183,0.000000,...,0.001152,0.151333,0.000000,0.107706,0.005842,0.000000,0.101141,0.123016,0.891383,0.000000


In [24]:
prediction_df.to_csv("submission.csv")

In [25]:
value = {}
for i,model in enumerate(models):
    value[columns[i]] = reg.predict(test_data)

pd.DataFrame.from_dict(value)

Unnamed: 0,s1,s2,s3,s4,s5,w1,w2,w3,w4,k1,...,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15
0,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,...,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542,0.018542
1,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,...,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537,0.004537
2,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,...,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053,-0.001053
3,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,...,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209,0.644209
4,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,...,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760,0.001760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42152,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,...,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168,0.008168
42153,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,...,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279,0.001279
42154,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,...,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308,-0.005308
42155,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,...,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161,-0.003161


In [26]:
# reg

In [27]:
# from sklearn.ensemble import RandomForestRegressor
# import time
# rfg = RandomForestRegressor(n_estimators=20,random_state=42)

In [28]:
# start_time = time.time()
# # print(start_time)
# rfg.fit(X_train,y_train.iloc[:,0])
# print("Seconds: ",time.time()-start_time)