In [1]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def generateDf(path):
    data = []
    with open(path, 'r') as f:
      for line in f.readlines():
        data.append(line[:-1].split(' '))
    return pd.DataFrame(data, columns=["word", "tag"])

In [56]:
def generateFileOut(model, X, output_path, vectorizer, encoder):
    with open(output_path, 'w') as f:
      for x in X:
        if x == None or x == '':
          f.write("\n")
          continue
        prediction = model.predict(vectorizer.transform([x]))
        f.write(x + ' ' + str(encoder.inverse_transform(prediction)[0]) + "\n")

In [None]:
def generateFileOutEntity(model, X, output_path, vectorizer, encoder):
    with open(output_path, 'w') as f:
      for x in X:
        if x == None or x == '':
          f.write("\n")
          continue
        prediction = model.predict(vectorizer.transform([x]))
        f.write(x + ' ' + str(encoder.inverse_transform(prediction)[0]) + "\n")

# Test single word prediction with XGBoost (English)

In [9]:
df = generateDf("EN/train")
df = df.dropna()
df

Unnamed: 0,word,tag
0,RT,O
1,@shaunproulx,O
2,:,O
3,Encore,B-INTJ
4,!,O
...,...,...
11230,kick,B-VP
11231,off,B-PRT
11232,3p,B-NP
11233,...,O


In [11]:
vect = TfidfVectorizer()
vect.fit(df['word'])
X_tfidf = vect.transform(df['word'])
X_tfidf

<10685x3026 sparse matrix of type '<class 'numpy.float64'>'
	with 8901 stored elements in Compressed Sparse Row format>

In [12]:
le = LabelEncoder()
le.fit(df['tag'])
y = le.transform(df['tag'])

In [13]:
le.classes_

array(['B-ADJP', 'B-ADVP', 'B-CONJP', 'B-INTJ', 'B-NP', 'B-PP', 'B-PRT',
       'B-SBAR', 'B-VP', 'I-ADJP', 'I-ADVP', 'I-CONJP', 'I-INTJ', 'I-NP',
       'I-PP', 'I-SBAR', 'I-VP', 'O'], dtype=object)

In [47]:
test_df = generateDf("EN/dev.out")
test_df

Unnamed: 0,word,tag
0,NO,B-NP
1,Saints,I-NP
2,R,I-NP
3,.,O
4,Buch,B-NP
...,...,...
1515,much,I-ADJP
1516,the,I-ADJP
1517,same,I-ADJP
1518,.,O


In [48]:
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)

generateFileOut(classifier, test_df['word'], "EN/dev.xgboost.out", vect, le)

In [23]:
def generateDfofTweets(path):
    with open(path, 'r') as f:
      data_in= f.read()
    labeled_tweets = data_in.split('\n\n')

    split_tweets = []
    for labeled_tweet in labeled_tweets:
      word_tag = labeled_tweet.split('\n')
      tweet_text = []
      tweet_tag = []
      for word_tag_pair in word_tag:
        word_tag_pair = word_tag_pair.split(' ')
        tweet_text.append(word_tag_pair[0])
        tweet_tag.append(word_tag_pair[-1])
      split_tweets.append([" ".join(tweet_text), " ".join(tweet_tag)])

    return pd.DataFrame(split_tweets, columns=["text", "tag"])
df = generateDfofTweets("EN/combined.txt")
df

Unnamed: 0,text,tag
0,RT @shaunproulx : Encore ! @bifnaked for the w...,O O O B-INTJ O O B-PP B-NP I-NP O B-NP O B-NP ...
1,Pick up here . Deliver on Dayton NJ tomorrow (...,B-VP B-PRT B-NP O B-VP B-PP B-NP I-NP B-NP B-P...
2,Cant wait for the ravens game tomorrow .... go...,B-VP I-VP B-PP B-NP I-NP I-NP B-NP O B-VP B-NP...
3,@CELLY32 I went to kolb for a week lol that sh...,O B-NP B-VP B-PP B-NP B-PP B-NP I-NP B-INTJ B-...
4,the day you go shirtless in school is the day ...,B-NP I-NP B-NP B-VP B-ADJP B-PP B-NP B-VP B-NP...
...,...,...
625,@jaredleto ... ive been to 13 30stm concerts n...,O O B-VP I-VP B-PP B-NP I-NP I-NP B-ADVP O B-N...
626,http://bit.ly/aTTQYq When Pepsi to ring usuall...,O B-ADVP B-NP B-VP I-VP I-VP I-VP B-PP O B-VP ...
627,@Jezkwon @kpop_stuff Tweet me a member for pic...,O O B-VP B-NP B-NP I-NP B-PP B-NP O B-VP I-VP ...
628,@BexsterBexster @shinobi32768 I 've been havin...,O O B-NP B-VP I-VP I-VP B-NP B-NP I-NP O B-VP ...


In [65]:
def generateDfofEntities(path):
    with open(path, 'r') as f:
      data_in= f.read()
    labeled_lines = data_in.split('\n')

    output = []
    entity = []
    entity_tag = []
    for line in labeled_lines:
      if line == "":
         output.append([])
         continue
      word, tag = line.split(' ')
      if tag=="O":
        # check if previous entity is complete
        if len(entity)>0:
          output.append([" ".join(entity), " ".join(entity_tag)])
          # reset entity
          entity=[]
          entity_tag=[]
        output.append([word, tag])
        continue

      current_pos, current_tag = tag.split('-')

      if current_pos=="B":
        # check if previous entity is complete
        if len(entity)>0:
          output.append([" ".join(entity), " ".join(entity_tag)])

        # reset entity
        entity=[word]
        entity_tag=[tag]
        continue

      elif current_pos=="I":
        # add to entity
        entity.append(word)
        entity_tag.append(tag)
        continue

    return pd.DataFrame(output, columns=["entity", "tag"])


# Test entity prediction with XGBoost (English)

In [79]:
df = generateDfofEntities("EN/train")
df = df.dropna()
df

Unnamed: 0,entity,tag
0,RT,O
1,@shaunproulx,O
2,:,O
3,Encore,B-INTJ
4,!,O
...,...,...
8510,kick,B-VP
8511,off,B-PRT
8512,3p,B-NP
8513,...,O


In [80]:
vect = TfidfVectorizer()
vect.fit(df['entity'])
X_tfidf = vect.transform(df['entity'])
X_tfidf

<7965x3026 sparse matrix of type '<class 'numpy.float64'>'
	with 8884 stored elements in Compressed Sparse Row format>

In [81]:
le = LabelEncoder()
le.fit(df['tag'])
y = le.transform(df['tag'])
y

array([45, 45, 45, ..., 14, 45, 45])

In [82]:
test_df = generateDfofEntities("EN/dev.out")
test_df

Unnamed: 0,entity,tag
0,NO Saints R,B-NP I-NP I-NP
1,.,O
2,Buch,B-NP
3,might come,B-VP I-VP
4,back,B-ADVP
...,...,...
1175,'s starting to taste,B-VP I-VP I-VP I-VP
1176,pretty much the same,B-ADJP I-ADJP I-ADJP I-ADJP
1177,.,O
1178,,


In [87]:
def generateFileOutEntity(model, X, path, vect, le):
    with open(path, 'w') as f:
      for x in X:
        if x == None or x == '':
          f.write("\n")
          continue
        prediction = model.predict(vect.transform([x]))
        prediction = le.inverse_transform(prediction)[0]
        words = x.split(' ')
        tags = prediction.split(' ')
        for word, tag in zip(words, tags):
          f.write(word + ' ' + tag + "\n")

In [88]:
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)

generateFileOutEntity(classifier, test_df['entity'], "EN/dev.xgboost.entity.out", vect, le)

# Test entity prediction with XGBoost (French)

In [5]:
df_fr = generateDfofEntities("FR/combined.txt")
df_fr

Unnamed: 0,entity,tag
0,Nous,O
1,avons,O
2,tout,O
3,aimé,O
4,.,O
...,...,...
29482,avec,O
29483,des,O
29484,bons,O
29485,conseils,O


In [7]:
le = LabelEncoder()
le.fit(df_fr['tag'])
y = le.transform(df_fr['tag'])
y

array([22, 22, 22, ..., 22, 22, 22])

In [8]:
vect = TfidfVectorizer()
vect.fit(df_fr['entity'])
X_tfidf = vect.transform(df_fr['entity'])
X_tfidf

<29487x3952 sparse matrix of type '<class 'numpy.float64'>'
	with 25046 stored elements in Compressed Sparse Row format>

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

In [11]:
classifier = XGBClassifier()
# unable to train the classifier using the train_test split since the number of classes are different (some classes are not present in the training set, leading to an error)
classifier.fit(X_train, y_train)
# predicting using part of the training data may return an overly optimistic result, but it can still give a gist of the performance of the classifier
prediction = classifier.predict(X_test)
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(prediction)))

                                                        precision    recall  f1-score   support

                                            B-negative       0.35      0.21      0.26       115
                                 B-negative I-negative       0.00      0.00      0.00        11
                      B-negative I-negative I-negative       0.54      0.54      0.54        13
           B-negative I-negative I-negative I-negative       1.00      0.25      0.40         4
B-negative I-negative I-negative I-negative I-negative       0.50      0.50      0.50         2
                                             B-neutral       0.00      0.00      0.00        24
                                   B-neutral I-neutral       0.00      0.00      0.00         2
                         B-neutral I-neutral I-neutral       0.00      0.00      0.00         3
               B-neutral I-neutral I-neutral I-neutral       0.00      0.00      0.00         1
                                       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Test single word prediction with XGBoost (French)

In [49]:
df_fr = generateDf("FR/train")
df_fr = df_fr.dropna()
df_fr

Unnamed: 0,word,tag
0,Nous,O
1,avons,O
2,tout,O
3,aimé,O
4,.,O
...,...,...
28193,à,O
28194,l,O
28195,écoute,O
28196,!,O


In [54]:
le = LabelEncoder()
le.fit(df_fr['tag'])
y = le.transform(df_fr['tag'])
y

array([6, 6, 6, ..., 6, 6, 6])

In [51]:
vect = TfidfVectorizer()
vect.fit(df_fr['word'])
X_tfidf = vect.transform(df_fr['word'])
X_tfidf

<26567x3679 sparse matrix of type '<class 'numpy.float64'>'
	with 22167 stored elements in Compressed Sparse Row format>

In [53]:
test_df = generateDf("FR/dev.out")
test_df

Unnamed: 0,word,tag
0,Petite,O
1,salle,O
2,ambiance,B-neutral
3,plage,O
4,Oléronaise,O
...,...,...
3695,des,O
3696,bons,O
3697,conseils,O
3698,.,O


In [57]:
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)

generateFileOut(classifier, test_df['word'], "FR/dev.xgboost.out", vect, le)