In [3]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
def generateDf(path):
    data = []
    with open(path, 'r') as f:
      for line in f.readlines():
        data.append(line[:-1].split(' '))
    return pd.DataFrame(data, columns=["word", "tag"])

# Test single word prediction with XGBoost (English)

In [3]:
df = generateDf("EN/combined.txt")
df = df.dropna()

In [5]:
vect = TfidfVectorizer()
vect.fit(df['word'])
X_tfidf = vect.transform(df['word'])
X_tfidf

<12127x3335 sparse matrix of type '<class 'numpy.float64'>'
	with 10098 stored elements in Compressed Sparse Row format>

In [6]:
le = LabelEncoder()
le.fit(df['tag'])
y = le.transform(df['tag'])

In [10]:
le.classes_

array(['B-ADJP', 'B-ADVP', 'B-CONJP', 'B-INTJ', 'B-NP', 'B-PP', 'B-PRT',
       'B-SBAR', 'B-VP', 'I-ADJP', 'I-ADVP', 'I-CONJP', 'I-INTJ', 'I-NP',
       'I-PP', 'I-SBAR', 'I-VP', 'O'], dtype=object)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

In [18]:
classifier = XGBClassifier()
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(prediction)))

              precision    recall  f1-score   support

      B-ADJP       0.38      0.21      0.27        38
      B-ADVP       0.89      0.34      0.49        97
     B-CONJP       0.00      0.00      0.00         1
      B-INTJ       0.53      0.18      0.27        49
        B-NP       0.85      0.47      0.61       629
        B-PP       0.84      0.82      0.83       182
       B-PRT       0.77      0.92      0.84        25
      B-SBAR       0.62      0.29      0.40        17
        B-VP       0.80      0.42      0.55       280
      I-ADJP       0.00      0.00      0.00         8
      I-ADVP       0.00      0.00      0.00        12
      I-INTJ       0.00      0.00      0.00        18
        I-NP       0.66      0.25      0.36       428
      I-SBAR       0.00      0.00      0.00         1
        I-VP       0.64      0.16      0.25       159
           O       0.33      0.98      0.49       482

    accuracy                           0.51      2426
   macro avg       0.46   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
le.inverse_transform(prediction)

array(['B-VP', 'O', 'O', ..., 'O', 'O', 'O'], dtype=object)

In [23]:
def generateDfofTweets(path):
    with open(path, 'r') as f:
      data_in= f.read()
    labeled_tweets = data_in.split('\n\n')

    split_tweets = []
    for labeled_tweet in labeled_tweets:
      word_tag = labeled_tweet.split('\n')
      tweet_text = []
      tweet_tag = []
      for word_tag_pair in word_tag:
        word_tag_pair = word_tag_pair.split(' ')
        tweet_text.append(word_tag_pair[0])
        tweet_tag.append(word_tag_pair[-1])
      split_tweets.append([" ".join(tweet_text), " ".join(tweet_tag)])

    return pd.DataFrame(split_tweets, columns=["text", "tag"])
df = generateDfofTweets("EN/combined.txt")
df

Unnamed: 0,text,tag
0,RT @shaunproulx : Encore ! @bifnaked for the w...,O O O B-INTJ O O B-PP B-NP I-NP O B-NP O B-NP ...
1,Pick up here . Deliver on Dayton NJ tomorrow (...,B-VP B-PRT B-NP O B-VP B-PP B-NP I-NP B-NP B-P...
2,Cant wait for the ravens game tomorrow .... go...,B-VP I-VP B-PP B-NP I-NP I-NP B-NP O B-VP B-NP...
3,@CELLY32 I went to kolb for a week lol that sh...,O B-NP B-VP B-PP B-NP B-PP B-NP I-NP B-INTJ B-...
4,the day you go shirtless in school is the day ...,B-NP I-NP B-NP B-VP B-ADJP B-PP B-NP B-VP B-NP...
...,...,...
625,@jaredleto ... ive been to 13 30stm concerts n...,O O B-VP I-VP B-PP B-NP I-NP I-NP B-ADVP O B-N...
626,http://bit.ly/aTTQYq When Pepsi to ring usuall...,O B-ADVP B-NP B-VP I-VP I-VP I-VP B-PP O B-VP ...
627,@Jezkwon @kpop_stuff Tweet me a member for pic...,O O B-VP B-NP B-NP I-NP B-PP B-NP O B-VP I-VP ...
628,@BexsterBexster @shinobi32768 I 've been havin...,O O B-NP B-VP I-VP I-VP B-NP B-NP I-NP O B-VP ...


In [25]:
def generateDfofEntities(path):
    with open(path, 'r') as f:
      data_in= f.read()
    labeled_lines = data_in.split('\n')

    output = []
    entity = []
    entity_tag = []
    for line in labeled_lines:
      if line == "":
         continue
      word, tag = line.split(' ')
      if tag=="O":
        # check if previous entity is complete
        if len(entity)>0:
          output.append([" ".join(entity), " ".join(entity_tag)])
          # reset entity
          entity=[]
          entity_tag=[]
        output.append([word, tag])
        continue

      current_pos, current_tag = tag.split('-')

      if current_pos=="B":
        # check if previous entity is complete
        if len(entity)>0:
          output.append([" ".join(entity), " ".join(entity_tag)])

        # reset entity
        entity=[word]
        entity_tag=[tag]
        continue

      elif current_pos=="I":
        # add to entity
        entity.append(word)
        entity_tag.append(tag)
        continue

    return pd.DataFrame(output, columns=["entity", "tag"])


# Test entity prediction with XGBoost (English)

In [26]:
df = generateDfofEntities("EN/combined.txt")
df

Unnamed: 0,entity,tag
0,RT,O
1,@shaunproulx,O
2,:,O
3,Encore,B-INTJ
4,!,O
...,...,...
9061,.,O
9062,It,B-NP
9063,'s starting to taste,B-VP I-VP I-VP I-VP
9064,pretty much the same,B-ADJP I-ADJP I-ADJP I-ADJP


In [20]:
vect = TfidfVectorizer()
vect.fit(df['entity'])
X_tfidf = vect.transform(df['entity'])
X_tfidf

<9066x3335 sparse matrix of type '<class 'numpy.float64'>'
	with 10081 stored elements in Compressed Sparse Row format>

In [22]:
le = LabelEncoder()
le.fit(df['tag'])
y = le.transform(df['tag'])
y

array([46, 46, 46, ..., 40,  3, 46])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

In [33]:
classifier = XGBClassifier()
# unable to train the classifier using the train_test split since the number of classes are different (some classes are not present in the training set, leading to an error)
classifier.fit(X_tfidf, y)
# predicting using part of the training data may return an overly optimistic result, but it can still give a gist of the performance of the classifier
prediction = classifier.predict(X_test)
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(prediction)))

                                              precision    recall  f1-score   support

                                      B-ADJP       0.43      0.11      0.17        28
                               B-ADJP I-ADJP       1.00      0.44      0.62         9
                        B-ADJP I-ADJP I-ADJP       1.00      0.33      0.50         3
                                 B-ADJP I-NP       0.00      0.00      0.00         1
                                      B-ADVP       0.85      0.54      0.66        72
                               B-ADVP I-ADVP       0.83      0.62      0.71         8
                        B-ADVP I-ADVP I-ADVP       1.00      1.00      1.00         1
                               B-ADVP I-INTJ       0.00      0.00      0.00         1
                                      B-INTJ       0.89      0.15      0.26        53
                               B-INTJ I-INTJ       0.00      0.00      0.00        12
                        B-INTJ I-INTJ I-INTJ       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test entity prediction with XGBoost (French)

In [5]:
df_fr = generateDfofEntities("FR/combined.txt")
df_fr

Unnamed: 0,entity,tag
0,Nous,O
1,avons,O
2,tout,O
3,aimé,O
4,.,O
...,...,...
29482,avec,O
29483,des,O
29484,bons,O
29485,conseils,O


In [7]:
le = LabelEncoder()
le.fit(df_fr['tag'])
y = le.transform(df_fr['tag'])
y

array([22, 22, 22, ..., 22, 22, 22])

In [8]:
vect = TfidfVectorizer()
vect.fit(df_fr['entity'])
X_tfidf = vect.transform(df_fr['entity'])
X_tfidf

<29487x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 25046 stored elements in Compressed Sparse Row format>

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

In [11]:
classifier = XGBClassifier()
# unable to train the classifier using the train_test split since the number of classes are different (some classes are not present in the training set, leading to an error)
classifier.fit(X_train, y_train)
# predicting using part of the training data may return an overly optimistic result, but it can still give a gist of the performance of the classifier
prediction = classifier.predict(X_test)
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(prediction)))

                                                        precision    recall  f1-score   support

                                            B-negative       0.35      0.21      0.26       115
                                 B-negative I-negative       0.00      0.00      0.00        11
                      B-negative I-negative I-negative       0.54      0.54      0.54        13
           B-negative I-negative I-negative I-negative       1.00      0.25      0.40         4
B-negative I-negative I-negative I-negative I-negative       0.50      0.50      0.50         2
                                             B-neutral       0.00      0.00      0.00        24
                                   B-neutral I-neutral       0.00      0.00      0.00         2
                         B-neutral I-neutral I-neutral       0.00      0.00      0.00         3
               B-neutral I-neutral I-neutral I-neutral       0.00      0.00      0.00         1
                                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test single word prediction with XGBoost (French)

In [20]:
df_fr = generateDf("FR/combined.txt")
df_fr = df_fr.dropna()
df_fr

Unnamed: 0,word,tag
0,Nous,O
1,avons,O
2,tout,O
3,aimé,O
4,.,O
...,...,...
31893,avec,O
31894,des,O
31895,bons,O
31896,conseils,O


In [21]:
le = LabelEncoder()
le.fit(df_fr['tag'])
y = le.transform(df_fr['tag'])
y

array([6, 6, 6, ..., 6, 6, 6])

In [22]:
vect = TfidfVectorizer()
vect.fit(df_fr['word'])
X_tfidf = vect.transform(df_fr['word'])
X_tfidf

<30035x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 25049 stored elements in Compressed Sparse Row format>

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

In [24]:
classifier = XGBClassifier()
# unable to train the classifier using the train_test split since the number of classes are different (some classes are not present in the training set, leading to an error)
classifier.fit(X_train, y_train)
# predicting using part of the training data may return an overly optimistic result, but it can still give a gist of the performance of the classifier
prediction = classifier.predict(X_test)
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(prediction)))

              precision    recall  f1-score   support

  B-negative       0.52      0.09      0.15       176
   B-neutral       0.00      0.00      0.00        25
  B-positive       0.49      0.42      0.45       165
  I-negative       0.00      0.00      0.00        46
   I-neutral       0.00      0.00      0.00        15
  I-positive       0.00      0.00      0.00        42
           O       0.94      0.99      0.97      5538

    accuracy                           0.93      6007
   macro avg       0.28      0.21      0.22      6007
weighted avg       0.90      0.93      0.91      6007



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
