# Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Import Data
I am importing both the test and train data so they can undergo the same preprocessing.

In [None]:
# Load the training and test sets from CSV into DataFrames.
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [None]:
# Preview the first five training rows.
train.head(5)


In [None]:
# Print the training messages as a 2-D array (one single-element row per
# message) to eyeball the raw text.
print(train[['message']].values)

In [None]:
# Preview the first five test rows.
test.head(5)

# Missing Data
I'm going to check whether any of the columns contain missing data.

In [None]:
# info() reports each column's non-null count and dtype; a non-null count
# below the total number of rows means that column has missing values.
train.info()

# Balance Sentiment Values

In [None]:
# Show how many rows each sentiment class has, first as numbers and then
# as a bar chart.
sentiment_counts = train['sentiment'].value_counts()
print(sentiment_counts)
sns.countplot(data=train, x='sentiment')

In [None]:
# Partition the training rows by sentiment label.
by_label = {label: train[train['sentiment'] == label] for label in (1, -1, 0, 2)}
believe = by_label[1]
no_believe = by_label[-1]
neutral = by_label[0]
news = by_label[2]

# Resample each of the other three classes, with replacement, until it has
# as many rows as `believe`. The fixed seed keeps the draw reproducible.
target_size = len(believe)
no_believe_upsampled, neutral_upsampled, news_upsampled = (
    resample(subset, replace=True, n_samples=target_size, random_state=27)
    for subset in (no_believe, neutral, news)
)

# Stack the untouched `believe` rows with the three resampled classes.
train_upsampled = pd.concat(
    [believe, no_believe_upsampled, neutral_upsampled, news_upsampled]
)
print(train_upsampled['sentiment'].value_counts())
sns.countplot(data=train_upsampled, x='sentiment')

# Preprocessing
## Remove noise from both datasets
I'm going to remove all URLs, email addresses, numbers and punctuation from all messages. I will then transform all messages to lower case.

In [None]:
# Translation table that deletes every ASCII punctuation mark and digit.
# It is built once so that each call is a single pass over the text instead
# of a per-character membership scan.
_PUNC_NUMBERS_TABLE = str.maketrans('', '', string.punctuation + '0123456789')


def remove_punctuation_numbers(post):
    """Return ``post`` with all ASCII punctuation and digits removed.

    Args:
        post: The message text to clean.

    Returns:
        A new string holding the remaining characters of ``post`` in their
        original order. Whitespace and non-ASCII characters are kept.
    """
    return post.translate(_PUNC_NUMBERS_TABLE)

    
def remove_noise(df):
    """Return a copy of ``df`` whose ``message`` column has been cleaned.

    The steps run in this order: URLs and e-mail addresses are replaced by a
    space, the text is lower-cased, punctuation and digits are stripped,
    runs of non-ASCII characters are replaced by a space, and finally every
    run of whitespace is collapsed to a single space.

    Args:
        df: DataFrame with a string ``message`` column. It is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    # URLs and e-mail addresses must be removed before punctuation is
    # stripped, otherwise the '://', '@' and '.' these patterns rely on
    # would already be gone.
    pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
    # The dot is escaped with a single backslash: in a raw string '\\.'
    # would demand a literal backslash in the text, so no e-mail address
    # would ever match.
    pattern_email = r'[A-Z0-9a-z._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,64}'
    pattern_non_ascii = r'[^\u0000-\u007F]+'
    # One pattern for every whitespace run (spaces, tabs, line breaks), so a
    # line break next to a space cannot leave a double space behind.
    pattern_whitespace = r'\s+'

    clean_df = df.copy()
    messages = clean_df['message']

    for pattern in (pattern_url, pattern_email):
        messages = messages.replace(to_replace=pattern, value=' ', regex=True)

    messages = messages.str.lower()

    # '@' and '#' count as punctuation, so mentions and hashtags keep their
    # text and only lose the leading marker.
    messages = messages.apply(remove_punctuation_numbers)

    for pattern in (pattern_non_ascii, pattern_whitespace):
        messages = messages.replace(to_replace=pattern, value=' ', regex=True)

    clean_df['message'] = messages
    return clean_df

In [None]:
# Print the raw messages so they can be compared with the cleaned output
# printed after this step.
print(train[['message']].values)
# NOTE(review): this cleans `train`, not the balanced `train_upsampled` built
# earlier; the balanced frame is not referenced again in the code visible
# here — confirm whether the model was meant to train on it.
train = remove_noise(train)



In [None]:
# Print the messages again to inspect the effect of the cleaning.
print(train[['message']].values)

In [None]:
# Clean the test messages with the same routine used for the training data,
# then show the first fifteen rows of the result.
test = remove_noise(test)
print(test.head(n=15))

# Split training into X & y
Split the training data into the variable X for the features and y for the label. Also extract the X features for the test data.

In [None]:
# Features are the cleaned message text; the label is the sentiment class.
X, y = train['message'], train['sentiment']
# Only the message text is taken from the test data.
test_X = test['message']

# Vectorize X & test_X

In [None]:
# Word-level TF-IDF over unigrams and bigrams with English stop words
# removed. min_df / max_df are integers, i.e. absolute document counts:
# terms found in fewer than 2 or in more than 3000 messages are dropped.
tfidf_options = dict(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='ascii',
    lowercase=True,
    min_df=2,
    max_df=3000,
    smooth_idf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training messages only
# and then reused, unchanged, to encode the test messages.
X_vectorized = vectorizer.fit_transform(X)
testX_vectorized = vectorizer.transform(test_X)

# Split train data into training & validation data

In [None]:
# Hold out 5% of the vectorised training rows for validation; the fixed seed
# makes the shuffled split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_vectorized,
    y,
    test_size=0.05,
    shuffle=True,
    random_state=11,
)

# Training the model & evaluation

In [None]:
# Logistic regression on the TF-IDF features.
# Only the hyper-parameters that differ from scikit-learn's defaults (plus the
# solver, for clarity) are spelled out. `multi_class` is deliberately no
# longer passed: it has been deprecated since scikit-learn 1.5, where passing
# it emits a FutureWarning, and it is scheduled for removal. The arguments
# dropped here all restated the defaults of scikit-learn >= 0.22, so the
# fitted model is unchanged on those versions.
lr = LogisticRegression(
    C=100.0,          # inverse regularisation strength: large C = weak L2 penalty
    tol=0.001,        # looser stopping tolerance than the library default
    solver='lbfgs',
    max_iter=1000,    # iteration cap for the solver
    random_state=11,
)
lr.fit(X_train, y_train)

# Predictions on the held-out validation rows, scored in the next step.
lr_pred = lr.predict(X_val)


# Check Model Performance
Check the performance of the model against the validation set held out from the training dataset.

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores, computed
# on the validation split.
f1_score(y_val,lr_pred, average='macro')

# Apply Model To Test Data

In [None]:
# Predict a sentiment class for every test message and store the result as a
# new column next to the test rows.
testlr_pred = lr.predict(testX_vectorized)
test['sentiment'] = testlr_pred
print(test.head(5))

# Creating an output csv for submission

In [None]:
# Keep only the tweet id and the predicted label, and write them without the
# DataFrame index.
submission = test[['tweetid', 'sentiment']]
submission.to_csv('testsubmission.csv', index=False)