In [1]:
import pandas as pd 
import numpy as np
import datetime
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 
st = PorterStemmer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
df=pd.read_csv("train_file.csv")
df_test=pd.read_csv("test_file.csv")
# df_test.head(5)
df.head(5)


Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55932 entries, 0 to 55931
Data columns (total 11 columns):
IDLink               55932 non-null object
Title                55932 non-null object
Headline             55932 non-null object
Source               55757 non-null object
Topic                55932 non-null object
PublishDate          55932 non-null object
Facebook             55932 non-null int64
GooglePlus           55932 non-null int64
LinkedIn             55932 non-null int64
SentimentTitle       55932 non-null float64
SentimentHeadline    55932 non-null float64
dtypes: float64(2), int64(3), object(6)
memory usage: 4.7+ MB


In [4]:
df.describe()

Unnamed: 0,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
count,55932.0,55932.0,55932.0,55932.0,55932.0
mean,132.050329,4.551616,14.300132,-0.006318,-0.029577
std,722.931314,21.137177,76.65142,0.137569,0.143038
min,-1.0,-1.0,-1.0,-0.838525,-0.755355
25%,0.0,0.0,0.0,-0.079057,-0.116927
50%,6.0,0.0,0.0,0.0,-0.027277
75%,37.0,2.0,4.0,0.063969,0.057354
max,49211.0,1267.0,3716.0,0.962354,0.964646


In [5]:
#making a table to check for null values in the dataframe and trying to understand which column has the most null values present
total = df.isnull().sum().sort_values(ascending=False)
percent_1 = df.isnull().sum()/df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
Source,175,0.3
SentimentHeadline,0,0.0
SentimentTitle,0,0.0
LinkedIn,0,0.0
GooglePlus,0,0.0


In [6]:
#Since source is the only column having null values, we shall try to replace it with the mode of the source dataframe 
df.dropna()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.000000,-0.053300
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.425210,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.000000,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.000000,0.141084
...,...,...,...,...,...,...,...,...,...,...,...
55927,jQ3CeLRCb9,Fidel Castro Lashes Out at Obama After Cuba Visit,Retired Cuban leader Fidel Castro slammed Pres...,Wall Street Journal,obama,2016-03-29 01:35:06,794,10,5,-0.135417,-0.055902
55928,akNYeJ8opY,JOHN CRISP | Obama's strategic reaction to Bru...,President Obama caught some predictable flak f...,Kitsap Sun,obama,2016-03-29 01:35:08,0,0,0,0.236228,0.056110
55929,n2DGs0c8IG,Think Trump's 45 Percent Tariffs Are Bad? Try ...,While Trump wants to put large tariffs on impo...,Huffington Post,obama,2016-03-29 01:35:09,102,4,0,0.025747,0.114820
55930,P0EBiaSEjq,Microsoft finally releases giant Surface,Microsoft’s business customers are finally beg...,TechEye,microsoft,2016-03-29 01:38:00,0,0,0,0.000000,-0.028296


In [7]:
#fill it up with the mode value 
# df['Source']=df['Source'].fillna("Bloomberg")
#checking again to see for missing values
df.isna().sum()

IDLink                 0
Title                  0
Headline               0
Source               175
Topic                  0
PublishDate            0
Facebook               0
GooglePlus             0
LinkedIn               0
SentimentTitle         0
SentimentHeadline      0
dtype: int64

In [0]:
#doing the same for the test data set
df_test['Source']=df_test['Source'].fillna("Bloomberg")

In [0]:
#DATA PRE-PROCESSING 

replace_puncts = {'`': "'", '′': "'", '“':'"', '”': '"', '‘': "'"}

strip_chars = [',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '[', ']', '>', '=', '+', '\\', '•',  '~', '@', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

puncts = ['!', '?', '$', '&', '/', '%', '#', '*','£']

def clean_str(x):
    x = str(x)
    
    x = x.lower()
    
    x = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", x)
    
    for k, v in replace_puncts.items():
        x = x.replace(k, f' {v} ')
        
    for punct in strip_chars:
        x = x.replace(punct, ' ') 
    
    for punct in puncts:
        x = x.replace(punct, ' ')
        
    x = x.replace(" '", " ")
    x = x.replace("' ", " ")
        
    return x

In [0]:
#TRAIN TITLE 
save=[]
#stop words 
count=1
for i in df['Title']:
  
  # print(i)

  # print()
  word_tokens = word_tokenize(i)
  filtered_sentence = [w for w in word_tokens if not w in stop_words]

  filtered_sentence = []
  store=[]
  for w in word_tokens:
        if w not in stop_words:
          w_lem=st.stem(w)
          filtered_sentence.append(w_lem)
  new_string=" ".join(filtered_sentence)
  # new_string_lem=lemmatizer.lemmatize(new_string)
  i=new_string

  store.append(i)
  save.append(store)





df['new_Title']=save #creating the new dataframe without stop words
#replacing the string obtained by removing the stop words from the original sentence with the the sentence obtained after removing the stop words
df['new_Title'] = df['new_Title'].apply(clean_str)

In [0]:
#TRAIN HEADLINE
save_head=[]
df['Headline'] = df['Headline'].apply(clean_str)

for i in df['Headline']:
  
  # print(i)

  # print()
  word_tokens_head = word_tokenize(i)
  filtered_sentence_head = [w for w in word_tokens_head if not w in stop_words]

  filtered_sentence_head = []
  store=[]
  for w in word_tokens_head:
        if w not in stop_words:
          w_lem=st.stem(w)
          filtered_sentence_head.append(w_lem)
  new_string_head=" ".join(filtered_sentence_head)
  # new_string_lem=lemmatizer.lemmatize(new_string)
  i=new_string_head

  store.append(i)
  save_head.append(store)


df['new_headline']=save_head
df['new_headline'] = df['new_headline'].apply(clean_str)  

In [0]:
#TEST HEADLINE
save_head=[]
df_test['Headline'] = df_test['Headline'].apply(clean_str)

for i in df_test['Headline']:

  word_tokens_head = word_tokenize(i)
  filtered_sentence_head = [w for w in word_tokens_head if not w in stop_words]

  filtered_sentence_head = []
  store=[]
  for w in word_tokens_head:
        if w not in stop_words:
          w_lem=st.stem(w)
          filtered_sentence_head.append(w_lem)
  new_string_head=" ".join(filtered_sentence_head)
  # new_string_lem=lemmatizer.lemmatize(new_string)
  i=new_string_head

  store.append(i)
  save_head.append(store)


df_test['new_headline']=save_head
df_test['new_headline'] = df_test['new_headline'].apply(clean_str)  

In [0]:
#TEST TITLE 


save_head=[]
df_test['Title'] = df_test['Title'].apply(clean_str)

for i in df_test['Title']:

  word_tokens_head = word_tokenize(i)
  filtered_sentence_head = [w for w in word_tokens_head if not w in stop_words]

  filtered_sentence_head = []
  store=[]
  for w in word_tokens_head:
        if w not in stop_words:
          w_lem=st.stem(w)
          filtered_sentence_head.append(w_lem)
  new_string_head=" ".join(filtered_sentence_head)
  # new_string_lem=lemmatizer.lemmatize(new_string)
  i=new_string_head

  store.append(i)
  save_head.append(store)


df_test['new_Title']=save_head
df_test['new_Title'] = df_test['Title'].apply(clean_str)  

In [14]:
#CATEGORICAL TO NUMBERICAL REPRESENTATION OF THE COLUMN TOPIC 

#FOR TRAIN 
save_topic=df["Topic"].unique()
topic_dict={}
count=1
for i in save_topic:
  topic_dict[i]=count
  count=count+1

print(topic_dict)

for i in topic_dict:
  df=df.replace(to_replace=i,value=topic_dict[i])


#FOR TESTING

save_topic_test=df_test["Topic"].unique()
topic_dict_test={}
count=1
for i in save_topic_test:
  topic_dict_test[i]=count
  count=count+1

print(topic_dict_test)

for i in topic_dict_test:
  df_test=df_test.replace(to_replace=i,value=topic_dict_test[i])




{'obama': 1, 'economy': 2, 'microsoft': 3, 'palestine': 4}
{'economy': 1, 'microsoft': 2, 'obama': 3, 'palestine': 4}


In [0]:

#SPLITTING THE DATE AND TIME COLUMNS IN ORDER TO OBTAIN THE HOUR AND DAY FROM IT 


df_day = []
df_test_day = []

for i in df['PublishDate']:
    df_day.append(datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S").strftime("%A"))
    
for i in df_test['PublishDate']:
    df_test_day.append(datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S").strftime("%A"))

df['day'] = df_day
df_test['day'] = df_test_day


# convert weekday to 0-6

df['day'] = df['day'].map({'Monday': 0,
                                        'Tuesday': 1,
                                        'Wednesday': 2,
                                        'Thursday': 3,
                                        'Friday': 4,
                                        'Saturday': 5,
                                        'Sunday': 6})
df_test['day'] = df_test['day'].map({'Monday': 0,
                                        'Tuesday': 1,
                                        'Wednesday': 2,
                                        'Thursday': 3,
                                        'Friday': 4,
                                        'Saturday': 5,
                                        'Sunday': 6})

In [0]:
df["hour"] = df["PublishDate"].apply(lambda x: x.split()[1].split(':')[0])
df_test["hour"] = df_test["PublishDate"].apply(lambda x: x.split()[1].split(':')[0])

In [17]:
df.columns


Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'Facebook', 'GooglePlus', 'LinkedIn', 'SentimentTitle',
       'SentimentHeadline', 'new_Title', 'new_headline', 'day', 'hour'],
      dtype='object')

In [0]:
mapper_title = DataFrameMapper([
    ('new_Title', TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))),
    ('Facebook', None),
    ('GooglePlus', None),
    ('LinkedIn', None),

], default = False)

In [0]:
mapper_headline = DataFrameMapper([
    ('new_headline', TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))),
    ('Facebook', None),
    ('GooglePlus', None),
    ('LinkedIn', None),
], default = False)

In [0]:
features_title = mapper_title.fit_transform(df)
labels_title =df['SentimentTitle']
test_features_title = mapper_title.transform(df_test)

features_headline = mapper_headline.fit_transform(df)
labels_headline = df['SentimentHeadline']
test_features_headline = mapper_headline.transform(df_test)

In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_title, labels_title, test_size=0.25, random_state=42)

In [22]:
estimator = LinearSVR(C=0.1)
#clf_svr_1 = GridSearchCV(estimator, parameters)
estimator.fit(x_train, y_train)
predictions_svr = estimator.predict(x_test)

mae1=mean_absolute_error(y_test,predictions_svr)



In [0]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_headline, labels_headline, test_size=0.25, random_state=42)

In [24]:
estimator1 = LinearSVR(C=0.2)
#clf_svr_1 = GridSearchCV(estimator, parameters)
estimator1.fit(x_train, y_train)
predictions_svr1 = estimator1.predict(x_test)
mae2=mean_absolute_error(y_test,predictions_svr)



In [25]:
Leaderboardscore=max(0,( 1 - ((0.4 * mae1) + (0.6 * mae2))))
print(Leaderboardscore)

0.8813214752694644


In [26]:
estimator.fit(features_title, labels_title)
final_pred_title = estimator.predict(test_features_title)

estimator1.fit(features_headline, labels_headline)
final_pred_headline = estimator.predict(test_features_headline)



In [0]:
final = pd.DataFrame({'IDLink': df_test['IDLink'], 'SentimentTitle': list(final_pred_title), 'SentimentHeadline': list(final_pred_headline)})
final.to_csv('final.csv',  encoding='utf-8', index=False)