In [2]:
import pandas as pd

train_file = "/home/colombelli/temp/applications/ey/data-set/train_complete.csv"
df = pd.read_csv(train_file)
df.head()

Unnamed: 0,tweet,label
0,"""QT @user In the original draft of the 7th boo...",2
1,"""Ben Smith / Smith (concussion) remains out of...",1
2,Sorry bout the stream last night I crashed out...,1
3,Chase Headley's RBI double in the 8th inning o...,1
4,@user Alciato: Bee will invest 150 million in ...,2


## Investigation of the basic dataset properties

In [11]:
print("Number of samples: ", len(df))
print("Labels:")
print(df['label'].value_counts())
print("\nTweet NaN values: ", df['tweet'].isna().sum())

Number of samples:  47615
Labels:
1    21542
2    18668
0     7405
Name: label, dtype: int64

Tweet NaN values:  0


## Data preprocessing 

I will use a baseline approach and compare to a state-of-the-art approach for sentiment analysis. 
The data preprocessing for the TF-IDF approach used with the baseline algorithm is heavier than the preprocessing performed in the data used by BERT.

In [37]:
import re
import string
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer 
from nltk.stem import PorterStemmer


tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                            reduce_len=True)
stemmer = PorterStemmer() 

# There are lots of these functions available on the internet
def text_preprocessing_tfidf(text):

    # Remove @mentions
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    # Remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # Remove hyperlinks
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    # Remove the hash # sign from hashtags
    text = re.sub(r'#', '', text)

    text_clean = []
    for word in tokenizer.tokenize(text):
        if (word not in stopwords.words('english') and  # Remove stopwords
            word not in string.punctuation):  # Remove punctuation

            stem_word = stemmer.stem(word) # happy, happiness, etc -> happi            
            text_clean.append(stem_word)

    return " ".join(text_clean)


# This process can take some time and could be improved
def get_tfidf_preprocessed_dataset(df):
    df['tweet'] = df['tweet'].\
        map(text_preprocessing_tfidf)
    #df.loc[:,'tweet'] = tweet_preprocessed_series
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/colombelli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
preprocessed_df = get_tfidf_preprocessed_dataset(df.iloc[:50, :])
preprocessed_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet'] = df['tweet'].\


Unnamed: 0,tweet,label
0,qt origin draft 7th book remu lupin surviv bat...,2
1,ben smith smith concuss remain lineup thursday...,1
2,sorri bout stream last night crash tonight sur...,1
3,chase headley' rbi doubl 8th inning david pric...,1
4,alciato bee invest 150 million januari anoth 2...,2


In [26]:
preprocessed_df

0        qt origin draft 7th book remu lupin surviv bat...
1        ben smith smith concuss remain lineup thursday...
2        sorri bout stream last night crash tonight sur...
3        chase headley' rbi doubl 8th inning david pric...
4        alciato bee invest 150 million januari anoth 2...
                               ...                        
47610    london ap princ georg celebr second birthday w...
47611    harper' worst offens refuge may climat record ...
47612    hold ... sam smith may theme spectr dope 007 s...
47613    gonna watch final destin 5 tonight alway leav ...
47614    interview devon alexand speed kill video tuesd...
Name: tweet, Length: 47615, dtype: object