<a href="https://colab.research.google.com/github/farnaz-orooji/nlp/blob/main/NER_ML_classification_using_TFIDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive inside the Colab VM; the dataset is later read from a path
# under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
%%time
ds = pd.read_csv("//content/drive/MyDrive/nlp_dataset/ner_dataset.csv", encoding = 'ISO-8859-1')

CPU times: user 377 ms, sys: 83.7 ms, total: 460 ms
Wall time: 1.44 s


In [7]:
# Shuffle every row (frac=1 samples the whole frame) and renumber the index from 0.
# A fixed seed keeps the shuffled order identical across kernel restarts.
df = ds.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,,stemming,VBG,O
1,,",",",",O
2,,expected,VBN,O
3,,in,IN,O
4,,tourists,NNS,O


In [8]:
# Forward-fill missing cells with the last valid value above them in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which recent pandas releases deprecate.
ds = ds.ffill()
# Encode each distinct Tag string as an integer id, numbered in order of first appearance.
ds['category_id'] = ds["Tag"].factorize()[0]

In [9]:
# Per-column count of remaining nulls. Only the last expression of a cell is rendered
# automatically, so this intermediate result has to be displayed explicitly.
display(ds.isnull().sum())
ds.head()

Unnamed: 0,Sentence #,Word,POS,Tag,category_id
0,Sentence: 1,Thousands,NNS,O,0
1,Sentence: 1,of,IN,O,0
2,Sentence: 1,demonstrators,NNS,O,0
3,Sentence: 1,have,VBP,O,0
4,Sentence: 1,marched,VBN,O,0


In [10]:
# Lower-cased copy of every token. The vectorised .str accessor passes missing values
# through as NaN, whereas calling s.lower() on a float NaN raises AttributeError.
# NOTE(review): df was sampled before the forward-fill, so 'Word' may still hold NaN — confirm.
df['new'] = df['Word'].str.lower()

In [11]:
from io import StringIO

# Integer id per tag, numbered in order of first appearance.
ds['category_id'] = ds['Tag'].factorize()[0]

# One row per distinct (Tag, category_id) pair. Without drop_duplicates() the two dicts
# below were built from every row of ds, re-inserting the same handful of keys each time.
category_id_df = (
    ds[['Tag', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
# Lookup tables in both directions: tag -> id and id -> tag.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Tag']].values)


In [12]:
# Number of non-null Word entries for each tag id.
ss = ds.groupby('category_id')['Word'].count()

### Split the dataset into train and test sets




In [13]:
from collections import Counter

# Features are the raw tokens; targets are the integer tag ids.
X = ds["Word"]
y = ds.category_id

# Stratify so every tag keeps its share in both splits. The fixed seed makes the split,
# and every score computed from it, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit TF-IDF on the training tokens only, then reuse that vocabulary for the test tokens.
# NOTE(review): each "document" here is a single token, so stop words (and anything the
# default token pattern rejects) presumably end up as all-zero rows — confirm this is intended.
tfidf = TfidfVectorizer(stop_words='english')
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

## Dealing with the imbalanced dataset

In [15]:
# Resampling utilities from imbalanced-learn.
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# One instance per candidate strategy, all with the same seed; only over_sampler is
# applied below.
RESAMPLING_SEED = 42
sm_tomek = SMOTETomek(random_state=RESAMPLING_SEED)
sm_enn = SMOTEENN(random_state=RESAMPLING_SEED)
sm = SMOTE(random_state=RESAMPLING_SEED)
over_sampler = RandomOverSampler(random_state=RESAMPLING_SEED)
under_sampler = RandomUnderSampler(random_state=RESAMPLING_SEED)

# Random over-sampling of the vectorised training split; the test split is not resampled.
X_train_new, y_train_new = over_sampler.fit_resample(X_train_tf, y_train)
# Alternative strategies, kept for reference:
# features, labels = sm.fit_resample(X_train_tf, y_train)
# features, labels = sm_enn.fit_resample(X_train_tf, y_train)
# features, labels = sm_tomek.fit_resample(X_train_tf, y_train)

# Per-class row counts before and after resampling.
counts_before = Counter(y_train)
counts_after = Counter(y_train_new)
print("The number of classes before fit {}".format(counts_before))
print("------")
print("The number of classes after fit {}".format(counts_after))

The number of classes before fit Counter({0: 621535, 1: 26351, 7: 14233, 5: 14100, 10: 12076, 3: 11893, 6: 11749, 2: 11109, 4: 5190, 12: 4569, 8: 281, 14: 215, 9: 208, 15: 177, 13: 141, 11: 139, 16: 36})
------
The number of classes after fit Counter({0: 621535, 1: 621535, 7: 621535, 4: 621535, 2: 621535, 6: 621535, 5: 621535, 10: 621535, 3: 621535, 12: 621535, 14: 621535, 9: 621535, 8: 621535, 13: 621535, 15: 621535, 16: 621535, 11: 621535})


## LogisticRegression

In [25]:
# `metrics` is imported here because this cell calls metrics.classification_report and
# metrics.f1_score, while the only other import of it sits in a later cell; on a fresh
# Restart & Run All the cell would otherwise fail with NameError.
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate

# Train on the over-sampled training matrix, evaluate on the untouched test split.
clf = LogisticRegression()
clf.fit(X_train_new, y_train_new)
y_pred = clf.predict(X_test_tf)
# Per-class precision/recall/F1, followed by the support-weighted F1.
print(metrics.classification_report(y_test, y_pred))
print(metrics.f1_score(y_test, y_pred, average='weighted'))

# Cross-validated model comparison, currently disabled.
# models = [
# #     RandomForestClassifier(n_estimators=50, max_depth=3, random_state=0),
# #     LinearSVC(),
#     # MultinomialNB()]
#     LogisticRegression(random_state=0)]

# CV = 5
# cv_df = pd.DataFrame(index=range(CV * len(models)))
# entries = []
# for model in models:
#     model_name = model.__class__.__name__
#     accuracies = cross_val_score(model, X_train_new, y_train_new, scoring='f1_macro', cv=CV)
#     for fold_idx, accuracy in enumerate(accuracies):
#         entries.append((model_name, fold_idx, accuracy))
# cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'f1_macro'])
# import seaborn as sns
# sns.boxplot(x='model_name', y='f1_macro', data=cv_df)
# sns.stripplot(x='model_name', y='f1_macro', data=cv_df,
#               size=8, jitter=True, edgecolor="gray", linewidth=2)
# plt.show()

              precision    recall  f1-score   support

           0       0.97      0.89      0.93    266373
           1       0.78      0.61      0.69     11293
           2       0.90      0.88      0.89      4761
           3       0.69      0.62      0.65      5097
           4       0.36      0.55      0.43      2224
           5       0.40      0.47      0.43      6043
           6       0.20      0.46      0.28      5035
           7       0.70      0.67      0.69      6100
           8       0.02      0.36      0.04       121
           9       0.01      0.19      0.01        89
          10       0.66      0.66      0.66      5175
          11       0.04      0.66      0.08        59
          12       0.15      0.43      0.22      1959
          13       0.09      0.58      0.15        60
          14       0.03      0.62      0.05        93
          15       0.01      0.47      0.02        76
          16       0.02      0.60      0.04        15

    accuracy              

In [13]:
# Mean macro-F1 per model across the cross-validation folds.
# NOTE(review): cv_df is only created by code that is commented out in the
# LogisticRegression cell, so this line raises NameError on a fresh kernel.
cv_df.groupby('model_name')['f1_macro'].mean()

model_name
LogisticRegression    0.794465
Name: f1_macro, dtype: float64

## RandomForest

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# classifier=MultinomialNB()
# RFC = RandomForestClassifier(class_weight="balanced")
# Seed the forest: its bootstrap sampling and per-split feature selection are random,
# so without random_state the reported score changes from run to run.
classifier = RandomForestClassifier(random_state=42)
# classifier = LogisticRegression(class_weight='balanced')
# Train on the over-sampled training matrix and predict the untouched test split.
classifier.fit(X_train_new, y_train_new)
y_pred = classifier.predict(X_test_tf)

In [20]:
from sklearn import metrics

# Support-weighted F1 of the current y_pred against the held-out labels.
weighted_f1 = metrics.f1_score(y_test, y_pred, average="weighted")
print(weighted_f1)

0.882197273757114


## GradientBoostingClassifier

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters, fitted on the original
# (not over-sampled) TF-IDF training matrix; fit() returns the fitted estimator.
model = GradientBoostingClassifier().fit(X_train_tf, y_train)
y_pred = model.predict(X_test_tf)

In [None]:
# roc_curve()/auc() are not imported anywhere in the visible notebook and only accept
# binary targets, while y_test holds multiple tag ids. Score the predicted class
# probabilities with one-vs-rest ROC AUC instead (macro average over classes).
roc_auc = roc_auc_score(
    y_test,
    model.predict_proba(X_test_tf),
    multi_class='ovr',
    labels=model.classes_,  # column order of predict_proba
)
roc_auc

In [None]:
# f1_score defaults to average='binary', which raises on multiclass targets; use the
# same support-weighted average as the other model cells.
f1_score = metrics.f1_score(y_test, y_pred, average='weighted')
f1_score

In [None]:
# Sweep the boosting learning rate and record train/test ROC AUC for each value.
# Fixes relative to the previous version of this cell:
#   * the first line fused two statements and did not parse;
#   * x_train/x_test are not defined in the notebook — the gradient-boosting cell trains
#     on the TF-IDF matrices X_train_tf/X_test_tf, so those are used here;
#   * roc_curve() rejects multiclass targets, so AUC is computed one-vs-rest from the
#     predicted class probabilities.
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
train_results = []
test_results = []
for eta in learning_rates:
    model = GradientBoostingClassifier(learning_rate=eta)
    model.fit(X_train_tf, y_train)
    train_results.append(
        roc_auc_score(y_train, model.predict_proba(X_train_tf),
                      multi_class='ovr', labels=model.classes_)
    )
    test_results.append(
        roc_auc_score(y_test, model.predict_proba(X_test_tf),
                      multi_class='ovr', labels=model.classes_)
    )

In [None]:
# Sweep the number of boosting stages and plot train/test ROC AUC against it.
# Fixes relative to the previous version of this cell:
#   * typographic quotes and fused statements in the plotting code did not parse;
#   * x_train/x_test are not defined in the notebook — the TF-IDF matrices
#     X_train_tf/X_test_tf are used instead;
#   * roc_curve() rejects multiclass targets, so AUC is computed one-vs-rest from the
#     predicted class probabilities.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_results = []
test_results = []
for estimator in n_estimators:
    model = GradientBoostingClassifier(n_estimators=estimator)
    model.fit(X_train_tf, y_train)
    train_results.append(
        roc_auc_score(y_train, model.predict_proba(X_train_tf),
                      multi_class='ovr', labels=model.classes_)
    )
    test_results.append(
        roc_auc_score(y_test, model.predict_proba(X_test_tf),
                      multi_class='ovr', labels=model.classes_)
    )

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(n_estimators, train_results, 'b', label="Train AUC")
line2, = plt.plot(n_estimators, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('n_estimators')
plt.show()

In [None]:
# Sweep the tree depth (1..32) and plot train/test ROC AUC against it.
# Fixes relative to the previous version of this cell:
#   * np.linspace produced float depths; max_depth is a tree depth, so integers are used;
#   * typographic quotes and fused statements in the plotting code did not parse;
#   * x_train/x_test are not defined in the notebook — the TF-IDF matrices
#     X_train_tf/X_test_tf are used instead;
#   * roc_curve() rejects multiclass targets, so AUC is computed one-vs-rest from the
#     predicted class probabilities.
max_depths = np.arange(1, 33)
train_results = []
test_results = []
for max_depth in max_depths:
    model = GradientBoostingClassifier(max_depth=int(max_depth))
    model.fit(X_train_tf, y_train)
    train_results.append(
        roc_auc_score(y_train, model.predict_proba(X_train_tf),
                      multi_class='ovr', labels=model.classes_)
    )
    test_results.append(
        roc_auc_score(y_test, model.predict_proba(X_test_tf),
                      multi_class='ovr', labels=model.classes_)
    )

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_depths, train_results, 'b', label="Train AUC")
line2, = plt.plot(max_depths, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.show()

In [None]:
# Sweep min_samples_split as a fraction of the samples (0.1 .. 1.0) and plot train/test
# ROC AUC against it.
# Fixes relative to the previous version of this cell:
#   * typographic quotes and fused statements in the plotting code did not parse;
#   * x_train/x_test are not defined in the notebook — the TF-IDF matrices
#     X_train_tf/X_test_tf are used instead;
#   * roc_curve() rejects multiclass targets, so AUC is computed one-vs-rest from the
#     predicted class probabilities.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results = []
test_results = []
for min_samples_split in min_samples_splits:
    model = GradientBoostingClassifier(min_samples_split=min_samples_split)
    model.fit(X_train_tf, y_train)
    train_results.append(
        roc_auc_score(y_train, model.predict_proba(X_train_tf),
                      multi_class='ovr', labels=model.classes_)
    )
    test_results.append(
        roc_auc_score(y_test, model.predict_proba(X_test_tf),
                      multi_class='ovr', labels=model.classes_)
    )

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(min_samples_splits, train_results, 'b', label="Train AUC")
line2, = plt.plot(min_samples_splits, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('min samples split')
plt.show()

In [None]:
# Sweep max_features and plot train/test ROC AUC against it.
# Fixes relative to the previous version of this cell:
#   * `train` is not defined in the notebook; the feature count is taken from the TF-IDF
#     training matrix instead;
#   * range(1, n_features) would fit one model per feature column, so a log-spaced grid
#     of at most 10 distinct values between 1 and n_features is used;
#   * typographic quotes and fused statements in the plotting code did not parse;
#   * x_train/x_test are not defined in the notebook — X_train_tf/X_test_tf are used;
#   * roc_curve() rejects multiclass targets, so AUC is computed one-vs-rest from the
#     predicted class probabilities.
n_features = X_train_tf.shape[1]
max_features = sorted({int(v) for v in np.geomspace(1, n_features, num=10)})
train_results = []
test_results = []
for max_feature in max_features:
    model = GradientBoostingClassifier(max_features=max_feature)
    model.fit(X_train_tf, y_train)
    train_results.append(
        roc_auc_score(y_train, model.predict_proba(X_train_tf),
                      multi_class='ovr', labels=model.classes_)
    )
    test_results.append(
        roc_auc_score(y_test, model.predict_proba(X_test_tf),
                      multi_class='ovr', labels=model.classes_)
    )

from matplotlib.legend_handler import HandlerLine2D
line1, = plt.plot(max_features, train_results, 'b', label="Train AUC")
line2, = plt.plot(max_features, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('max features')
plt.show()

In [None]:
# to add more features into our dataframe like : POS , lemma, ...
import spacy
nlp = spacy.load('en_core_web_sm')
tokens = []
lemma = []
pos = []
ds = ds[:10]
for doc in nlp.pipe(ds['Word'].astype("unicode").values, batch_size=50,
                        n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text.lower() for n in doc])
        lemma.append([n.lemma_.lower() for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

ds['species_tokens'] = tokens
ds['species_lemma'] = lemma
ds['species_pos'] = pos
# print(ds.head())
print(tokens, lemma, pos)

In [None]:
# Keep at most the first 100000 rows (positional slice).
ds = ds.iloc[:100000]

In [None]:
# Feature frame: every column of ds except the target (Tag) and POS.
X = ds.drop(columns=['Tag', 'POS'])

In [None]:
# dictvectorizer : transforms lists of feature-value mappings to vectors
v = DictVectorizer(sparse=False)
X= v.fit_transform(X.to_dict('records'))
# also X.invverse_transform return back the list to the featue-value (dict-like)

In [None]:
# Target vector: the Tag string of every row.
y = ds['Tag'].values

In [None]:
# Sorted list of the distinct tags present in y, as plain Python values.
classes = np.unique(y).tolist()

In [None]:
# Sanity check: X and y must have the same number of rows.
(X.shape, y.shape)

In [None]:
# Hold out 33% of the rows for evaluation; random_state=0 keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Shapes of the training features, test features and training targets.
(X_train.shape, X_test.shape, y_train.shape)

In [None]:
# Labels to report on: every tag except the 'O' class.
# Filtering by value replaces classes.copy() + pop(), which removed whatever label came
# last and therefore only dropped 'O' when it happened to sort last.
new_classes = [tag for tag in classes if tag != 'O']

In [None]:
# test 
a = ds.Tag
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_x = LabelEncoder()
a_lab = labelencoder_x.fit_transform(a)
# a_lab[:40]
# Transform into a Matrix
onehotencoder1 = OneHotEncoder()
b = onehotencoder1.fit_transform(a_lab.reshape(-1,1)).toarray()
# b[:10]

In [None]:
# Single incremental-fit pass over the training split; partial_fit is given the full
# list of class labels.
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes=classes)

In [None]:
# Predict the held-out rows and report per-tag precision/recall/F1 for new_classes.
# classification_report expects (y_true, y_pred); the arguments were swapped, which
# exchanges precision and recall. `labels` is passed by keyword because current
# scikit-learn releases do not accept it positionally.
y_pred = sgd.predict(X_test)
print(classification_report(y_test, y_pred, labels=new_classes))