In [1]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def generateDf(path, encoding=None):
    """Load a ``word tag`` file (one token per line) into a DataFrame.

    Every line is split on single spaces and becomes one row. A blank line
    (sentence separator) becomes a row whose ``word`` is ``''`` and whose
    ``tag`` is missing, so callers can either keep it as a boundary marker
    or remove it with ``dropna()``.

    Parameters
    ----------
    path : str
        File to read.
    encoding : str or None, optional
        Passed to ``open``. ``None`` (the default) keeps the platform
        default encoding, which is what this function always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``["word", "tag"]``, one row per input line.
    """
    data = []
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            # Strip only the line terminator. Slicing with [:-1] would cut
            # the last character of the tag when the final line of the file
            # does not end with a newline.
            data.append(line.rstrip('\n').split(' '))
    return pd.DataFrame(data, columns=["word", "tag"])

In [3]:
def generateFileOut(model, X, output_path, vectorizer, encoder, encoding=None):
    """Predict a tag for every token in ``X`` and write ``word tag`` lines.

    Blank entries (``None``, NaN or ``''``) are treated as sentence
    separators and written as an empty line.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    X : iterable
        Tokens to label, in output order.
    output_path : str
        File to (over)write.
    vectorizer : object
        Fitted vectorizer exposing ``transform``.
    encoder : object
        Fitted label encoder exposing ``inverse_transform``.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.
    """
    items = list(X)
    # pd.isna also catches NaN, which `x == None` does not.
    blank = [pd.isna(x) or x == '' for x in items]
    tokens = [x for x, is_blank in zip(items, blank) if not is_blank]

    # Vectorise and predict once for all tokens instead of once per token.
    # Predictions are computed before the file is opened, so a failure here
    # does not leave a truncated output file behind.
    if tokens:
        labels = iter(encoder.inverse_transform(
            model.predict(vectorizer.transform(tokens))))
    else:
        labels = iter(())

    with open(output_path, 'w', encoding=encoding) as f:
        for x, is_blank in zip(items, blank):
            if is_blank:
                f.write("\n")
            else:
                f.write(x + ' ' + str(next(labels)) + "\n")

In [4]:
def generateFileOutEntity(model, X, output_path, vectorizer, encoder,
                          default_tag="O", encoding=None):
    """Predict a tag sequence per entity and write one ``word tag`` line per token.

    NOTE: this notebook defines ``generateFileOutEntity`` a second time in a
    later cell (In [7]); on a top-to-bottom run that later definition wins.
    This version writes the same per-token format as the later one, so
    re-running this cell out of order no longer switches the output back to
    one ``entity tags`` line per entity. One of the two should be deleted.

    Parameters
    ----------
    model, vectorizer, encoder : objects
        Fitted classifier (``predict``), vectorizer (``transform``) and
        label encoder (``inverse_transform``).
    X : iterable
        Space-joined entities; ``None``, NaN or ``''`` marks a sentence
        break and is written as an empty line.
    output_path : str
        File to (over)write.
    default_tag : str, optional
        Tag used for trailing words when the predicted sequence has fewer
        tags than the entity has words.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.
    """
    items = list(X)
    blank = [pd.isna(x) or x == '' for x in items]
    entities = [x for x, is_blank in zip(items, blank) if not is_blank]

    # A single batched prediction for all entities.
    if entities:
        predicted = iter(encoder.inverse_transform(
            model.predict(vectorizer.transform(entities))))
    else:
        predicted = iter(())

    with open(output_path, 'w', encoding=encoding) as f:
        for x, is_blank in zip(items, blank):
            if is_blank:
                f.write("\n")
                continue
            words = x.split(' ')
            tags = str(next(predicted)).split(' ')
            for i, word in enumerate(words):
                # Every input word must appear in the output, otherwise the
                # file no longer lines up with the input token by token.
                tag = tags[i] if i < len(tags) else default_tag
                f.write(word + ' ' + tag + "\n")

In [5]:
def generateDfofEntities(path, encoding=None):
    """Group a ``word tag`` file into one row per chunk/entity.

    Consecutive ``B-x`` / ``I-x`` tokens are merged into a single row whose
    ``entity`` is the space-joined words and whose ``tag`` is the
    space-joined tags. ``O`` tokens become single-word rows. Every blank
    line becomes a row of missing values (sentence separator).

    A pending entity is closed when a blank line, an ``O`` token, a new
    ``B-`` token or the end of the file is reached, so entities stay inside
    their own sentence and the last one is not lost.

    Parameters
    ----------
    path : str
        File to read; each non-blank line must be ``word tag``.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    pandas.DataFrame
        Columns ``["entity", "tag"]``.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields, or a
        tag other than ``O`` contains no ``-``.
    """
    with open(path, 'r', encoding=encoding) as f:
        data_in = f.read()

    output = []
    entity = []
    entity_tag = []

    def _flush():
        # Emit the entity collected so far (if any) and start a new one.
        if entity:
            output.append([" ".join(entity), " ".join(entity_tag)])
            entity.clear()
            entity_tag.clear()

    for line in data_in.split('\n'):
        if line == "":
            # Close the entity before emitting the separator so it stays in
            # the sentence it belongs to.
            _flush()
            output.append([None, None])
            continue

        word, tag = line.split(' ')
        if tag == "O":
            _flush()
            output.append([word, tag])
            continue

        # maxsplit=1 keeps chunk types that themselves contain a hyphen.
        position, _chunk_type = tag.split('-', 1)
        if position == "B":
            _flush()
            entity.append(word)
            entity_tag.append(tag)
        elif position == "I":
            entity.append(word)
            entity_tag.append(tag)
        # Tokens with any other prefix are ignored, as before.

    # The file may end in the middle of an entity.
    _flush()
    return pd.DataFrame(output, columns=["entity", "tag"])


In [6]:
def generateDfofTweets(path, encoding=None):
    """Build one row per tweet (blank-line separated block) from a ``word tag`` file.

    For every block, the first space-separated field of each line is
    collected into ``text`` and the last field into ``tag``; both are joined
    with single spaces. A line without a space therefore contributes the
    same string to both columns.

    Empty lines inside a block and blocks with no tokens are skipped, so a
    trailing newline at the end of the file no longer adds an empty
    word/tag to the last tweet or an empty row to the frame.

    Parameters
    ----------
    path : str
        File to read.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    pandas.DataFrame
        Columns ``["text", "tag"]``, one row per tweet.
    """
    with open(path, 'r', encoding=encoding) as f:
        data_in = f.read()

    split_tweets = []
    for labeled_tweet in data_in.split('\n\n'):
        pairs = [line.split(' ') for line in labeled_tweet.split('\n') if line != ""]
        if not pairs:
            continue
        tweet_text = " ".join(pair[0] for pair in pairs)
        tweet_tag = " ".join(pair[-1] for pair in pairs)
        split_tweets.append([tweet_text, tweet_tag])

    return pd.DataFrame(split_tweets, columns=["text", "tag"])
# Preview the tweet-level frame for the combined English file.
# `df` is rebound to the token-level training frame in a later cell.
df = generateDfofTweets("EN/combined.txt")
df

Unnamed: 0,text,tag
0,RT @shaunproulx : Encore ! @bifnaked for the w...,O O O B-INTJ O O B-PP B-NP I-NP O B-NP O B-NP ...
1,Pick up here . Deliver on Dayton NJ tomorrow (...,B-VP B-PRT B-NP O B-VP B-PP B-NP I-NP B-NP B-P...
2,Cant wait for the ravens game tomorrow .... go...,B-VP I-VP B-PP B-NP I-NP I-NP B-NP O B-VP B-NP...
3,@CELLY32 I went to kolb for a week lol that sh...,O B-NP B-VP B-PP B-NP B-PP B-NP I-NP B-INTJ B-...
4,the day you go shirtless in school is the day ...,B-NP I-NP B-NP B-VP B-ADJP B-PP B-NP B-VP B-NP...
...,...,...
625,@jaredleto ... ive been to 13 30stm concerts n...,O O B-VP I-VP B-PP B-NP I-NP I-NP B-ADVP O B-N...
626,http://bit.ly/aTTQYq When Pepsi to ring usuall...,O B-ADVP B-NP B-VP I-VP I-VP I-VP B-PP O B-VP ...
627,@Jezkwon @kpop_stuff Tweet me a member for pic...,O O B-VP B-NP B-NP I-NP B-PP B-NP O B-VP I-VP ...
628,@BexsterBexster @shinobi32768 I 've been havin...,O O B-NP B-VP I-VP I-VP B-NP B-NP I-NP O B-VP ...


In [7]:
def generateFileOutEntity(model, X, path, vect, le, default_tag="O", encoding=None):
    """Predict a tag sequence per entity and write one ``word tag`` line per token.

    NOTE: ``generateFileOutEntity`` is also defined in an earlier cell
    (In [4]); this later definition replaces it when the notebook is run
    top to bottom. One of the two should be deleted.

    Parameters
    ----------
    model, vect, le : objects
        Fitted classifier (``predict``), vectorizer (``transform``) and
        label encoder (``inverse_transform``).
    X : iterable
        Space-joined entities; ``None``, NaN or ``''`` marks a sentence
        break and is written as an empty line.
    path : str
        File to (over)write.
    default_tag : str, optional
        Tag used for trailing words when the predicted sequence has fewer
        tags than the entity has words. Extra predicted tags are ignored.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.
    """
    items = list(X)
    # pd.isna also catches NaN, which `x == None` does not.
    blank = [pd.isna(x) or x == '' for x in items]
    entities = [x for x, is_blank in zip(items, blank) if not is_blank]

    # One batched prediction instead of one model call per entity.
    if entities:
        predicted = iter(le.inverse_transform(model.predict(vect.transform(entities))))
    else:
        predicted = iter(())

    with open(path, 'w', encoding=encoding) as f:
        for x, is_blank in zip(items, blank):
            if is_blank:
                f.write("\n")
                continue
            words = x.split(' ')
            tags = str(next(predicted)).split(' ')
            for i, word in enumerate(words):
                # zip(words, tags) would silently drop words when the
                # prediction is shorter, misaligning output and input.
                tag = tags[i] if i < len(tags) else default_tag
                f.write(word + ' ' + tag + "\n")

In [8]:
# extract bigrams from the training data
def generateBigramDf(path, encoding=None):
    """Turn a ``word tag`` file into overlapping word bigrams with paired tags.

    Within a sentence, every pair of adjacent tokens produces one row:
    ``bigram`` is ``"w1 w2"`` and ``tag`` is ``"t1 t2"``. Every blank line
    produces a row of missing values and restarts the pairing, so bigrams
    never span a sentence boundary.

    NOTE(review): a sentence consisting of a single token produces no bigram
    row, so that token is absent from the frame (and from any file written
    from it). Changing that here would also require changing
    ``generateFileOutBigram``.

    Parameters
    ----------
    path : str
        File to read; each non-blank line must be ``word tag``.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    pandas.DataFrame
        Columns ``["bigram", "tag"]``.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields.
    """
    with open(path, 'r', encoding=encoding) as f:
        data_in = f.read()

    output = []
    previous_word = None
    previous_tag = None
    for line in data_in.split('\n'):
        if line == "":
            previous_word = None
            previous_tag = None
            # Explicit two-column separator row: the frame can then still be
            # built when the input contains no bigram at all (e.g. an empty
            # file), where a bare [] row cannot be fitted to two columns.
            output.append([None, None])
            continue
        word, tag = line.split(' ')
        if previous_word is not None:
            output.append([previous_word + " " + word, previous_tag + " " + tag])
        previous_word = word
        previous_tag = tag

    return pd.DataFrame(output, columns=["bigram", "tag"])

In [9]:
def generateFileOutBigram(model, X, path, vect, le, default_tag="O", encoding=None):
    """Predict tag pairs for overlapping bigrams and write ``word tag`` lines.

    ``X`` is expected to hold overlapping bigrams (``"w1 w2"``, ``"w2 w3"``,
    ...) with a blank entry (``None``, NaN or ``''``) at each sentence
    break. For the first bigram of a sentence both words are written; for
    each following bigram only the second word is written, because its first
    word was already emitted by the previous bigram. A blank entry is
    written as an empty line.

    Parameters
    ----------
    model, vect, le : objects
        Fitted classifier (``predict``), vectorizer (``transform``) and
        label encoder (``inverse_transform``).
    X : iterable
        Bigrams and blank separators, in output order.
    path : str
        File to (over)write.
    default_tag : str, optional
        Tag used when the predicted label has fewer tags than the bigram
        has words.
    encoding : str or None, optional
        Passed to ``open``; ``None`` keeps the platform default.
    """
    items = list(X)
    # pd.isna also catches NaN, which `x == None` does not.
    blank = [pd.isna(x) or x == '' for x in items]
    bigrams = [x for x, is_blank in zip(items, blank) if not is_blank]

    # One batched prediction instead of one model call per bigram.
    if bigrams:
        predicted = iter(le.inverse_transform(model.predict(vect.transform(bigrams))))
    else:
        predicted = iter(())

    with open(path, 'w', encoding=encoding) as f:
        sentence_start = True
        for x, is_blank in zip(items, blank):
            if is_blank:
                sentence_start = True
                f.write("\n")
                continue
            words = x.split(' ')
            tags = str(next(predicted)).split(' ')
            # Pad short predictions so indexing cannot fail and no word is lost.
            tags += [default_tag] * (len(words) - len(tags))
            pairs = list(zip(words, tags))
            if not sentence_start:
                # The first word was already written by the previous bigram.
                pairs = pairs[1:]
            for word, tag in pairs:
                f.write(word + ' ' + tag + "\n")
            sentence_start = False

# Test single word prediction with XGBoost (English)

In [10]:
# Load the English training tokens. Blank separator lines come back with a
# missing tag, so dropna() removes them and leaves only real tokens.
df = generateDf("EN/train")
df = df.dropna()
df

Unnamed: 0,word,tag
0,RT,O
1,@shaunproulx,O
2,:,O
3,Encore,B-INTJ
4,!,O
...,...,...
11230,kick,B-VP
11231,off,B-PRT
11232,3p,B-NP
11233,...,O


In [11]:
# Fit TF-IDF on the individual training words and vectorise them.
# NOTE(review): fewer entries are stored than there are rows, presumably
# because the default tokenizer drops punctuation and one-character tokens,
# leaving those rows all-zero. Confirm against the TfidfVectorizer docs.
vect = TfidfVectorizer()
vect.fit(df['word'])
X_tfidf = vect.transform(df['word'])
X_tfidf

<10685x3026 sparse matrix of type '<class 'numpy.float64'>'
	with 8901 stored elements in Compressed Sparse Row format>

In [12]:
# Encode the string tags as integer class ids for the classifier.
le = LabelEncoder()
le.fit(df['tag'])
y = le.transform(df['tag'])

In [13]:
# Tag inventory learned by the encoder.
le.classes_

array(['B-ADJP', 'B-ADVP', 'B-CONJP', 'B-INTJ', 'B-NP', 'B-PP', 'B-PRT',
       'B-SBAR', 'B-VP', 'I-ADJP', 'I-ADVP', 'I-CONJP', 'I-INTJ', 'I-NP',
       'I-PP', 'I-SBAR', 'I-VP', 'O'], dtype=object)

In [14]:
# Load the English dev file. No dropna() here: the blank separator rows are
# kept so generateFileOut can reproduce the sentence breaks.
test_df = generateDf("EN/dev.out")
test_df

Unnamed: 0,word,tag
0,NO,B-NP
1,Saints,I-NP
2,R,I-NP
3,.,O
4,Buch,B-NP
...,...,...
1515,much,I-ADJP
1516,the,I-ADJP
1517,same,I-ADJP
1518,.,O


In [15]:
# Train on the English word features, then write one predicted tag per dev
# word. Relies on `X_tfidf`, `y`, `vect` and `le` from the English cells
# above; later cells rebind the same names.
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)

generateFileOut(classifier, test_df['word'], "EN/dev.xgboost.out", vect, le)

# Test single word prediction with XGBoost (French)

In [16]:
# Load the French training tokens and drop the blank separator rows.
df_fr = generateDf("FR/train")
df_fr = df_fr.dropna()
df_fr

Unnamed: 0,word,tag
0,Nous,O
1,avons,O
2,tout,O
3,aimÃ©,O
4,.,O
...,...,...
28193,Ã,O
28194,l,O
28195,Ã©coute,O
28196,!,O


In [17]:
# Encode the French tags. This rebinds `le` and `y`, replacing the English
# encoder and labels.
le = LabelEncoder()
le.fit(df_fr['tag'])
y = le.transform(df_fr['tag'])
y

array([6, 6, 6, ..., 6, 6, 6])

In [18]:
# Fit TF-IDF on the French training words. This rebinds `vect` and
# `X_tfidf`, replacing the English vectorizer and features.
vect = TfidfVectorizer()
vect.fit(df_fr['word'])
X_tfidf = vect.transform(df_fr['word'])
X_tfidf

<26567x3536 sparse matrix of type '<class 'numpy.float64'>'
	with 23345 stored elements in Compressed Sparse Row format>

In [19]:
# Load the French dev file, keeping blank separator rows. Rebinds `test_df`.
test_df = generateDf("FR/dev.out")
test_df

Unnamed: 0,word,tag
0,Petite,O
1,salle,O
2,ambiance,B-neutral
3,plage,O
4,OlÃ©ronaise,O
...,...,...
3695,des,O
3696,bons,O
3697,conseils,O
3698,.,O


In [20]:
# Train on the French word features and write one predicted tag per dev
# word. Uses the `X_tfidf`, `y`, `vect` and `le` bound by the French cells above.
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)

generateFileOut(classifier, test_df['word'], "FR/dev.xgboost.out", vect, le)

# Test bigram prediction with XGBoost (English)

In [21]:
# Build the English training bigrams; dropna() removes the all-missing
# rows that mark sentence breaks.
df_bigram = generateBigramDf("EN/train")
df_bigram = df_bigram.dropna()
df_bigram

Unnamed: 0,bigram,tag
0,RT @shaunproulx,O O
1,@shaunproulx :,O O
2,: Encore,O B-INTJ
3,Encore !,B-INTJ O
4,! @bifnaked,O O
...,...,...
10679,", kick",O B-VP
10680,kick off,B-VP B-PRT
10681,off 3p,B-PRT B-NP
10682,3p ...,B-NP O


In [22]:
# Each distinct "tag1 tag2" pair is one class here. Rebinds `le` and `y`.
le = LabelEncoder()
le.fit(df_bigram['tag'])
y = le.transform(df_bigram['tag'])
y

array([129, 129, 121, ...,  51,  39, 129])

In [23]:
# Fit TF-IDF on the bigram strings. Rebinds `vect` and `X_tfidf`.
vect = TfidfVectorizer()
vect.fit(df_bigram['bigram'])
X_tfidf = vect.transform(df_bigram['bigram'])
X_tfidf

<10134x3026 sparse matrix of type '<class 'numpy.float64'>'
	with 16777 stored elements in Compressed Sparse Row format>

In [24]:
# Build the English dev bigrams. The all-missing separator rows are kept so
# generateFileOutBigram can write the sentence breaks.
test_df = generateBigramDf("EN/dev.out")
test_df

Unnamed: 0,bigram,tag
0,NO Saints,B-NP I-NP
1,Saints R,I-NP I-NP
2,R .,I-NP O
3,. Buch,O B-NP
4,Buch might,B-NP B-VP
...,...,...
1438,much the,I-ADJP I-ADJP
1439,the same,I-ADJP I-ADJP
1440,same .,I-ADJP O
1441,,


In [25]:
# Train on the English bigram features and write per-token predictions
# reconstructed from the overlapping bigrams.
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)
generateFileOutBigram(classifier, test_df['bigram'], "EN/dev.xgboost.bigram.out", vect, le)

# Test bigram prediction with XGBoost (French)

In [26]:
# Build the French training bigrams and drop the sentence-break rows.
# Rebinds `df_bigram`, replacing the English bigram frame.
df_bigram = generateBigramDf("FR/train")
df_bigram = df_bigram.dropna()
df_bigram

Unnamed: 0,bigram,tag
0,Nous avons,O O
1,avons tout,O O
2,tout aimÃ©,O O
3,aimÃ© .,O O
5,Le foi,O B-positive
...,...,...
26561,trÃ¨s Ã,O O
26562,Ã l,O O
26563,l Ã©coute,O O
26564,Ã©coute !,O O


In [27]:
# Encode the French "tag1 tag2" pairs as class ids. Rebinds `le` and `y`.
le = LabelEncoder()
le.fit(df_bigram['tag'])
y = le.transform(df_bigram['tag'])
y

array([16, 16, 16, ..., 16, 16, 16])

In [28]:
# Fit TF-IDF on the French bigram strings. Rebinds `vect` and `X_tfidf`.
vect = TfidfVectorizer()
vect.fit(df_bigram['bigram'])
X_tfidf = vect.transform(df_bigram['bigram'])
X_tfidf

<24935x3535 sparse matrix of type '<class 'numpy.float64'>'
	with 44972 stored elements in Compressed Sparse Row format>

In [29]:
# Build the French dev bigrams, keeping the separator rows for the writer.
test_df = generateBigramDf("FR/dev.out")
test_df

Unnamed: 0,bigram,tag
0,Petite salle,O O
1,salle ambiance,O B-neutral
2,ambiance plage,B-neutral O
3,plage OlÃ©ronaise,O O
4,OlÃ©ronaise .,O O
...,...,...
3464,des bons,O O
3465,bons conseils,O O
3466,conseils .,O O
3467,,


In [30]:
# Train on the French bigram features and write per-token predictions
# reconstructed from the overlapping bigrams.
classifier = XGBClassifier()
classifier.fit(X_tfidf, y)
generateFileOutBigram(classifier, test_df['bigram'], "FR/dev.xgboost.bigram.out", vect, le)