In [1]:
import spacy
import pandas as pd
import csv



In [2]:
# Load the spaCy pipeline named 'en_core_web_sm'. The cells below call
# `nlp(tweet)` and read sentences, POS tags, lemmas and entities from the result.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the tab-separated SemEval-2018 Task 3 (subtask B) training file.
# QUOTE_NONE keeps quotation marks inside tweets as literal characters.
# `on_bad_lines='warn'` (pandas >= 1.3) replaces the removed
# `error_bad_lines=False`: malformed rows are skipped and a warning is emitted,
# which is what the old flag did with its default `warn_bad_lines=True`.
dataset = pd.read_csv(
    "SemEval2018-Task3/datasets/train/SemEval2018-T3-train-taskB.txt",
    delimiter='\t',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='warn',
)
print(dataset.head())

   Tweet Index  Label                                         Tweet text
0            1      1  Sweet United Nations video. Just in time for C...
1            2      1  @mrdahl87 We are rumored to have talked to Erv...
2            3      1  Hey there! Nice to see you Minnesota/ND Winter...
3            4      0                3 episodes left I'm dying over here
4            5      2  "I can't breathe!" was chosen as the most nota...


# A) Linguistic analysis using spaCy 

In [24]:
from collections import Counter

# Corpus-level counters filled in by the loop below.
num_words = 0            # tokens that are not punctuation
num_tokens = 0           # every token spaCy produces, punctuation included
total_word_length = 0    # summed len() of all non-punctuation tokens

# Insertion-ordered dict used as an ordered set: membership checks are O(1)
# instead of scanning a growing list for every token.
types_seen = {}

POS_frequencies_coarse = Counter()   # counts of token.pos_ over words
POS_frequencies_fine = Counter()     # counts of token.tag_ over words

# Maps token text -> {'count', 'POS_tag_fine', 'POS_tag_coarse'}.
# The two tag fields hold the tags of the FIRST occurrence of that text only.
token_count_dict = {}

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, tweet in dataset['Tweet text'].items():
    doc = nlp(tweet)

    for sentence in doc.sents:
        for token in sentence:
            num_tokens += 1
            types_seen.setdefault(token.text, None)

            # Let's filter out punctuation
            if token.is_punct:
                continue

            num_words += 1
            total_word_length += len(token)

            POS_frequencies_coarse[token.pos_] += 1
            POS_frequencies_fine[token.tag_] += 1

            entry = token_count_dict.get(token.text)
            if entry is None:
                # Key order matters: it becomes the DataFrame column order later.
                entry = {
                    'count': 0,
                    'POS_tag_fine': token.tag_,
                    'POS_tag_coarse': token.pos_,
                }
                token_count_dict[token.text] = entry
            entry['count'] += 1

# Distinct token texts in order of first appearance (punctuation included).
types_list = list(types_seen)

## Tokenization

In [25]:
# Report the raw corpus counts gathered by the analysis loop.
corpus_counts = (
    ('Number of tokens: ', num_tokens),
    ('Number of types: ', len(types_list)),
    ('Number of words: ', num_words),
)
for caption, value in corpus_counts:
    print(caption, value)

# Averages: words per dataset row, and characters per non-punctuation token.
words_per_tweet = num_words / len(dataset)
print('Average number of words per tweet: ', words_per_tweet)
mean_word_length = total_word_length / num_words
print('Average word length: ', mean_word_length)


Number of tokens:  66518
Number of types:  14957
Number of words:  55221
Average number of words per tweet:  14.402973395931143
Average word length:  5.213415186251607


## POS Tagging

In [26]:
# One row per distinct token text. The token text starts out as the row index
# and reset_index() moves it into a regular column named 'index'.
token_rows = pd.DataFrame.from_dict(token_count_dict, orient='index')
word_frequency_df = token_rows.reset_index()

In [27]:
# The ten most frequent fine-grained POS tags as (tag, count) pairs,
# ordered from most to least frequent.
common_tags = POS_frequencies_fine.most_common(n=10)
print(common_tags)


[('NN', 9254), ('NNP', 5133), ('IN', 4728), ('DT', 3706), ('PRP', 3624), ('RB', 3519), ('JJ', 3446), ('VB', 2909), ('NNS', 2521), ('VBP', 2113)]


In [28]:
# Inspect the column names. The extra reset_index() here is applied to a
# temporary copy only: it is why 'level_0' shows up in the printout even though
# word_frequency_df itself has no such column.
columns_after_reset = word_frequency_df.reset_index().columns
print(columns_after_reset)

Index(['level_0', 'index', 'count', 'POS_tag_fine', 'POS_tag_coarse'], dtype='object')


In [30]:
# For each of the most common fine-grained tags, show the three most frequent
# word types recorded with that tag and the last (least frequent) one.
# Unpacking the (tag, count) pairs directly also copes with an empty
# common_tags, where list(zip(*common_tags))[0] would raise IndexError.
for tag, _tag_count in common_tags:
    print(tag)
    has_tag = word_frequency_df['POS_tag_fine'] == tag
    df_tag = word_frequency_df.loc[has_tag].sort_values(by='count', ascending=False)

    # token_count_dict keeps only the first-seen tag of each token text, so a
    # frequent tag can end up with no rows here; iloc[-1] would then fail.
    if df_tag.empty:
        print('(no word types recorded with this tag)')
        continue

    print(df_tag.iloc[:3])
    print(df_tag.iloc[-1])

NN
     index  count POS_tag_fine POS_tag_coarse
105    day    147           NN           NOUN
6     time     94           NN           NOUN
879  today     86           NN           NOUN
index             http://t.co/E189iHBpZr
count                                  1
POS_tag_fine                          NN
POS_tag_coarse                      NOUN
Name: 14902, dtype: object
NNP
          index  count POS_tag_fine POS_tag_coarse
8     Christmas     98          NNP          PROPN
926          RT     51          NNP          PROPN
2524        New     27          NNP          PROPN
index             Motion
count                  1
POS_tag_fine         NNP
POS_tag_coarse     PROPN
Name: 14900, dtype: object
IN
   index  count POS_tag_fine POS_tag_coarse
5     in    585           IN            ADP
60    of    582           IN            ADP
7    for    501           IN            ADP
index             http://t.co/wsFo2Dlu7h
count                                  1
POS_tag_fine              

## Lemmatization

In [None]:
# Print every token in the first 100 tweets whose lemma differs from its
# surface form, together with the sentence it occurs in.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, tweet in dataset['Tweet text'].iloc[:100].items():
    doc = nlp(tweet)

    for sentence in doc.sents:
        for token in sentence:
            if token.text != token.lemma_:
                print(token.text, token.lemma_, sentence)

## Named Entity Recognition 


In [None]:
# Count how often each entity label occurs over all tweets.
NER_frequencies = Counter()

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, tweet in dataset['Tweet text'].items():
    doc = nlp(tweet)
    # One increment per recognised entity, keyed by its label.
    NER_frequencies.update(ent.label_ for ent in doc.ents)

    

In [None]:
# Show the label -> occurrence-count mapping built in the previous cell.
print(NER_frequencies)

In [None]:
# The summed counts give the total number of entity mentions; the number of
# keys gives how many distinct labels occurred. (The two values were
# previously printed under each other's caption.)
print('Number of named entities: ', sum(NER_frequencies.values()))
print('Number of different entity labels: ', len(NER_frequencies))


In [None]:
from spacy import displacy

# Render the recognised entities of the first three tweets inline.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, tweet in dataset['Tweet text'].iloc[:3].items():
    doc = nlp(tweet)
    displacy.render(doc, jupyter=True, style='ent')