In [0]:
#IMPORT ALL THE NECESSARY LIBRARIES

import pandas as pd 
import numpy as np
import datetime
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet
import warnings
warnings.filterwarnings("ignore")

In [2]:
#INSTALLING ALL THE REQUIRED DEPEDANCIES OF NLTK NEEDED FOR OUR PROGRAM 
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
#CREATING AN OBJECT FOR STEMMING
st = PorterStemmer()

In [0]:
#CREATING AN OBJECT FOR LEMMATIZATION 
lemmatizer = WordNetLemmatizer() 

In [0]:
#CREATING A SET OF ALL STOP WORDS
stop_words = set(stopwords.words('english'))

In [6]:
#READING THE TRAIN AND TEST FILES
df = pd.read_csv("train_file.csv")
df_test = pd.read_csv("test_file.csv")

#DISPLAYING THE FIRST 5 ROWS OF THE TRAIN FILE 
df.head(5)


Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [7]:
#OBTAINING THE INFORMATION ABOUT THE VARIOUS COLUMNS OF THE DATASET

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55932 entries, 0 to 55931
Data columns (total 11 columns):
IDLink               55932 non-null object
Title                55932 non-null object
Headline             55932 non-null object
Source               55757 non-null object
Topic                55932 non-null object
PublishDate          55932 non-null object
Facebook             55932 non-null int64
GooglePlus           55932 non-null int64
LinkedIn             55932 non-null int64
SentimentTitle       55932 non-null float64
SentimentHeadline    55932 non-null float64
dtypes: float64(2), int64(3), object(6)
memory usage: 4.7+ MB


In [8]:
#UNDERSTANING THE VARIOUS FEATURES OF THE DATASET SUCH AS THE MEAN, MEDIAN AND MODE

df.describe()

Unnamed: 0,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
count,55932.0,55932.0,55932.0,55932.0,55932.0
mean,132.050329,4.551616,14.300132,-0.006318,-0.029577
std,722.931314,21.137177,76.65142,0.137569,0.143038
min,-1.0,-1.0,-1.0,-0.838525,-0.755355
25%,0.0,0.0,0.0,-0.079057,-0.116927
50%,6.0,0.0,0.0,0.0,-0.027277
75%,37.0,2.0,4.0,0.063969,0.057354
max,49211.0,1267.0,3716.0,0.962354,0.964646


In [9]:
#CREATING A TABLE IN ORDER TO UNDERSTAND THE WHICH COLUMNS HAVE NULL VALUES IN THEM
#ARRANGING THE VALUES IN DECENDING ORDER IN ORDER TO GET A FAIR IDEA OF THE COLUMN WITH THE MOST NULL VALUES

total = df.isnull().sum().sort_values(ascending=False)
percent_1 = df.isnull().sum()/df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

Unnamed: 0,Total,%
Source,175,0.3
SentimentHeadline,0,0.0
SentimentTitle,0,0.0
LinkedIn,0,0.0
GooglePlus,0,0.0
Facebook,0,0.0
PublishDate,0,0.0
Topic,0,0.0
Headline,0,0.0
Title,0,0.0


In [10]:
#FINDING THE MODE OF THE SOURCE COLUMN 
df['Source'].mode()

0    Bloomberg
dtype: object

In [0]:
#REPLACING THE NULL VALUES WITH THE MODE VALUE IN BOTH TRAIN AND TEST DATASET

df['Source']=df['Source'].fillna("Bloomberg")
df_test['Source']=df_test['Source'].fillna("Bloomberg")

In [0]:
#DATA PRE-PROCESSING PART

replace_puncts = {'`': "'", '′': "'", '“':'"', '”': '"', '‘': "'"}

strip_chars = [',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '[', ']', '>', '=', '+', '\\', '•',  '~', '@', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

puncts = ['!', '?', '$', '&', '/', '%', '#', '*','£']

#FUNCTION FOR CLEANING THE STRING PASSED TO IT 
def clean_str(x):
    x = str(x)
    
    #CONVERTING ALL THE VALUES TO LOWERCASE 
    x = x.lower()
    
    x = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9]\.[^\s]{2,})", "url", x)
    
    for k, v in replace_puncts.items():
        x = x.replace(k, f' {v} ')
        
    for punct in strip_chars:
        x = x.replace(punct, ' ') 
    
    for punct in puncts:
        x = x.replace(punct, ' ')
        
    x = x.replace(" '", " ")
    x = x.replace("' ", " ")
        
    return x

In [0]:
#CLEANING THE TITLE COLUMN OF THE TRAINING DATASET

save=[] 
count=1
for i in df['Title']:
  word_tokens = word_tokenize(i)
  filtered_sentence = []
  store=[]
  for w in word_tokens:
        if w not in stop_words: #REMOVAL OF STOP WORDS
          w_lem=st.stem(w) #LEMMATIZING THE WORDS
          filtered_sentence.append(w_lem) 
  new_string=" ".join(filtered_sentence)
  i=new_string
  store.append(i)
  save.append(store)

#CREATING A NEW COLUMN IN THE DATAFRAME WHICH IS THE CLEANED VERSION OF THE TITLE COLUMN 
df['new_Title']=save 
df['new_Title'] = df['new_Title'].apply(clean_str)

In [0]:
#CLEANING THE HEADLINE COLUMN OF THE TRAINING DATASET
save_head=[]
df['Headline'] = df['Headline'].apply(clean_str)

for i in df['Headline']:
  word_tokens_head = word_tokenize(i)
  filtered_sentence_head = []
  store=[]
  for w in word_tokens_head:
        if w not in stop_words: #REMOVAL OF STOP WORDS
          w_lem=st.stem(w) #LEMMATIZING THE WORDS
          filtered_sentence_head.append(w_lem)
  new_string_head=" ".join(filtered_sentence_head)
  i=new_string_head
  store.append(i)
  save_head.append(store)


#CREATING A NEW COLUMN IN THE DATAFRAME WHICH IS THE CLEANED VERSION OF THE HEADLINE COLUMN 

df['new_headline']=save_head
df['new_headline'] = df['new_headline'].apply(clean_str)  

In [0]:
#CLEANING THE HEADLINE COLUMN OF THE TEST DATASET
save_head=[]
df_test['Headline'] = df_test['Headline'].apply(clean_str)

for i in df_test['Headline']:
  word_tokens_head = word_tokenize(i)
  filtered_sentence_head = []
  store=[]
  for w in word_tokens_head:
        if w not in stop_words: #REMOVAL OF STOP WORDS
          w_lem=st.stem(w) #LEMMATIZING THE WORDS
          filtered_sentence_head.append(w_lem)
  new_string_head=" ".join(filtered_sentence_head)
  i=new_string_head
  store.append(i)
  save_head.append(store)

#CREATING A NEW COLUMN IN THE DATAFRAME WHICH IS THE CLEANED VERSION OF THE HEADLINE COLUMN 

df_test['new_headline']=save_head
df_test['new_headline'] = df_test['new_headline'].apply(clean_str)  

In [0]:
#CLEANING THE TITLE COLUMN OF THE TEST DATASET

save_head=[]
df_test['Title'] = df_test['Title'].apply(clean_str)

for i in df_test['Title']:

  word_tokens_head = word_tokenize(i)
  filtered_sentence_head = [w for w in word_tokens_head if not w in stop_words]

  filtered_sentence_head = []
  store=[]
  for w in word_tokens_head:
        if w not in stop_words: #REMOVAL OF STOP WORDS
          w_lem=st.stem(w) #LEMMATIZING THE WORDS
          filtered_sentence_head.append(w_lem)
  new_string_head=" ".join(filtered_sentence_head)
  # new_string_lem=lemmatizer.lemmatize(new_string)
  i=new_string_head

  store.append(i)
  save_head.append(store)

#CREATING A NEW COLUMN IN THE DATAFRAME WHICH IS THE CLEANED VERSION OF THE TITLE COLUMN 
df_test['new_Title']=save_head
df_test['new_Title'] = df_test['Title'].apply(clean_str)  

In [17]:
#CATEGORICAL TO NUMBERICAL CONVERSION OF THE COLUMN TOPIC 

#FOR TRAIN DATASET
save_topic=df["Topic"].unique()
topic_dict={}
count=1
for i in save_topic:
  topic_dict[i]=count
  count=count+1

print(topic_dict)

#REPLACING
for i in topic_dict:
  df=df.replace(to_replace=i,value=topic_dict[i])

#FOR TESTING DATASET

save_topic_test=df_test["Topic"].unique()
topic_dict_test={}
count=1
for i in save_topic_test:
  topic_dict_test[i]=count
  count=count+1

print(topic_dict_test)

#REPLACING
for i in topic_dict_test:
  df_test=df_test.replace(to_replace=i,value=topic_dict_test[i])

{'obama': 1, 'economy': 2, 'microsoft': 3, 'palestine': 4}
{'economy': 1, 'microsoft': 2, 'obama': 3, 'palestine': 4}


In [0]:
#SPLITTING THE DATE AND TIME COLUMNS IN ORDER TO OBTAIN THE HOUR AND DAY FROM IT 

df_day = []
df_test_day = []

for i in df['PublishDate']:
    df_day.append(datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S").strftime("%A"))
    
for i in df_test['PublishDate']:
    df_test_day.append(datetime.datetime.strptime(i, "%Y-%m-%d %H:%M:%S").strftime("%A"))

df['day'] = df_day
df_test['day'] = df_test_day

In [0]:
#MAPPING THE DAYS OF THE WEEK TO NUMERIC VALUES 

#FOR TRAINING DATASET
df['day'] = df['day'].map({'Monday': 0,
                                        'Tuesday': 1,
                                        'Wednesday': 2,
                                        'Thursday': 3,
                                        'Friday': 4,
                                        'Saturday': 5,
                                        'Sunday': 6})

#FOR TESTING DATASET
df_test['day'] = df_test['day'].map({'Monday': 0,
                                        'Tuesday': 1,
                                        'Wednesday': 2,
                                        'Thursday': 3,
                                        'Friday': 4,
                                        'Saturday': 5,
                                        'Sunday': 6})

In [0]:
#EXTRACTING THE HOUR FROM THE PUBLISH DATE COLUMN

df["hour"] = df["PublishDate"].apply(lambda x: x.split()[1].split(':')[0])
df_test["hour"] = df_test["PublishDate"].apply(lambda x: x.split()[1].split(':')[0])

In [21]:
#CHECKING FOR ALL THE NEW COLUMNS CREATED

df.columns


Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'Facebook', 'GooglePlus', 'LinkedIn', 'SentimentTitle',
       'SentimentHeadline', 'new_Title', 'new_headline', 'day', 'hour'],
      dtype='object')

In [0]:
#CREATING A MAPPING FUNCTION WHERE THE NEW_TITLE IS MAPPED WITH TFIDF MODLE TO OBTAIN THE NUMERIC VALUE OF THE STRING
#THE REST VALUES ARE MENTIONED AS IT IS SINCE THEY ARE IN THEIR NUMBERIC FORMAT

mapper_title = DataFrameMapper([
    ('new_Title', TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))),
    ('Facebook', None),
    ('GooglePlus', None),
    ('LinkedIn', None),

], default = False)




In [0]:
#CREATING A MAPPING FUNCTION WHERE THE NEW_TITLE IS MAPPED WITH TFIDF MODLE TO OBTAIN THE NUMERIC VALUE OF THE STRING
#THE REST VALUES ARE MENTIONED AS IT IS SINCE THEY ARE IN THEIR NUMBERIC FORMAT

mapper_headline = DataFrameMapper([
    ('new_headline', TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))),
    ('Facebook', None),
    ('GooglePlus', None),
    ('LinkedIn', None),
], default = False)

In [0]:
#OBTAINING THE RELEVANT DATA NEEDED FOR TRAINING-TESTING DATA AND ALSO FOR PREDICTION PURPOSES 

#FOR TITLE
features_title = mapper_title.fit_transform(df)
labels_title =df['SentimentTitle']
test_features_title = mapper_title.transform(df_test)

#FOR HEADLINES
features_headline = mapper_headline.fit_transform(df)
labels_headline = df['SentimentHeadline']
test_features_headline = mapper_headline.transform(df_test)

In [0]:
#SPLITTING THE DATA INTO TESTING AND TRAINING DATA FOR TITLE 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(features_title, labels_title, test_size=0.25, random_state=42)

In [0]:
#APPLYING VARIOUS ALGORITHMS TO CHECK THE EFFICIENCY 

In [27]:
#LINEAR SVR 

estimator = LinearSVR(C=0.1)
estimator.fit(X_train,Y_train)
predictions_svr = estimator.predict(X_test)
mae1=mean_absolute_error(Y_test,predictions_svr)
print(mae1)

0.08495550229224834


In [28]:
#LASSO REGRESSION 

clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, Y_train)
Y_prediction = clf.predict(X_test)
mae1forLASSO=mean_absolute_error(Y_test,Y_prediction)
print(mae1forLASSO)

0.0975483356133821


In [29]:
#RIDGE REGRESSION 

clf1_t = linear_model.Ridge(alpha=1.0)
clf1_t.fit(X_train, Y_train)
Y_prediction = clf1_t.predict(X_test)
mae1forRIDGE=mean_absolute_error(Y_test,Y_prediction)
print(mae1forRIDGE)

0.08103896845801914


In [30]:
#ELASTIC NET 

regr = ElasticNet(random_state=0)
regr.fit(X_train, Y_train)
Y_prediction = regr.predict(X_test)
mae1forEN=mean_absolute_error(Y_test,Y_prediction)
print(mae1forEN)

0.09754924503260368


In [31]:
#DISPLAYING THE RESULTS OBTAINED FOR THE MAE METRIC BY CARRYING OUR VARIOUS ALGORITHMS 

results = pd.DataFrame({
    'Model': ['LINEAR SVR', 'LASSO', 'RIDGE', 
              'ELASTIC NET'],
    'Score': [mae1, mae1forLASSO, mae1forRIDGE, 
              mae1forEN]})
result_df = results.sort_values(by='Score', ascending=True)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.081039,RIDGE
0.084956,LINEAR SVR
0.097548,LASSO
0.097549,ELASTIC NET


In [32]:
#FINDING THE MINIMUM VALUE OF THE MAE FROM ALL THE AVAILABLE VALUES FOR TITLE 
min_val_x=results['Score'].min()
print(min_val_x)

0.08103896845801914


In [0]:
#SPLITTING THE DATA INTO TESTING AND TRAINING DATA FOR HEADLINES 

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(features_headline, labels_headline, test_size=0.25, random_state=42)

In [0]:
#LINEAR SVR 

estimator1 = LinearSVR(C=0.2)
estimator1.fit(X_train, Y_train)
predictions_svr1 = estimator1.predict(X_test)
mae2=mean_absolute_error(Y_test,predictions_svr)
print(mae2)

In [0]:
#LASSO REGRESSION 

clf = linear_model.Lasso(alpha=0.1)
clf.fit(X_train, Y_train)
Y_prediction = clf.predict(X_test)
mae2forLASSO=mean_absolute_error(Y_test,Y_prediction)
print(mae2forLASSO)

In [0]:
#RIDGE REGRESSION 

clf1 = linear_model.Ridge(alpha=1.0)
clf1.fit(X_train, Y_train)
Y_prediction = clf1.predict(X_test)
mae2forRIDGE=mean_absolute_error(Y_test,Y_prediction)
print(mae2forRIDGE)

In [0]:
#ELASTIC NET 

regr = ElasticNet(random_state=0)
regr.fit(X_train, Y_train)
Y_prediction = regr.predict(X_test)
mae2forEN=mean_absolute_error(Y_test,Y_prediction)
print(mae2forEN)

In [0]:
#DISPLAYING THE RESULTS OBTAINED FOR THE MAE METRIC BY CARRYING OUR VARIOUS ALGORITHMS 

results1 = pd.DataFrame({
    'Model': ['LINEAR SVR', 'LASSO', 'RIDGE', 
              'ELASTIC NET'],
    'Score': [mae2, mae2forLASSO, mae2forRIDGE, 
              mae2forEN]})
result1_df = results1.sort_values(by='Score', ascending=True)
result1_df = result1_df.set_index('Score')
result1_df

In [0]:
#FINDING THE MINIMUM VALUE OF THE MAE FROM ALL THE AVAILABLE VALUES FOR HEADLINES
min_val=results1['Score'].min()
print(min_val)

In [0]:
#CALCULATING THE LEADERBOARD SCORE BY THE HELP OF THE GIVEN FORMULA 

Leaderboardscore=max(0,(1 - ((0.4 * min_val_x) + (0.6 * min_val))))
print(Leaderboardscore)

In [0]:
#FINDING THE PREDICTED VALUES BY CONSIDERING THE MOST OPTIMUM ALGO, HERE RIDGE REGRESSION 

#FOR TITLE
clf1_t.fit(features_title, labels_title)
final_pred_title = clf1_t.predict(test_features_title)

#FOR HEADLINE
clf1.fit(features_headline, labels_headline)
final_pred_headline = clf1.predict(test_features_headline)

In [0]:
#GETTING THE ID FOR THE NEWS, THE TITLE SENTIMENT PREDICTED , THE HEADLINE SENTIMENT PREDICTED
final = pd.DataFrame({'IDLink': df_test['IDLink'], 'SentimentTitle': list(final_pred_title), 'SentimentHeadline': list(final_pred_headline)})

#COVERTING THE DATAFRAME TO CSV FORMAT FOR SUBMISSION
final.to_csv('final-RIDGE.csv',  encoding='utf-8', index=False)