In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# Read the training and validation frames from their CSV files.
# Note that the validation frame comes from 'final_test.csv'.
train_data, val_data = (
    pd.read_csv(csv_path) for csv_path in ('final_train.csv', 'final_test.csv')
)

# Convert data to a list of lists
def convert_data(data):
    """Split each row's sentence and tag strings into token lists.

    Args:
        data: DataFrame with a 'sentence' column and a 'tags' column, each
            holding whitespace-separated strings.

    Returns:
        A tuple ``(X, y)``: ``X[k]`` is the list of words of row ``k`` and
        ``y[k]`` is the list of tags of row ``k``, both in row order.

    Raises:
        KeyError: if either column is missing from ``data``.
    """
    # Walk the two columns directly. iterrows() would build a throwaway
    # Series for every row, and the row index it yields was never used.
    X = [sentence.split() for sentence in data['sentence']]
    y = [tag_string.split() for tag_string in data['tags']]
    return X, y

# Tokenise both frames (training first, then validation) into per-sentence
# word lists and tag lists.
(X_train, y_train), (X_val, y_val) = map(convert_data, (train_data, val_data))

# Define features for SVM
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict describes the token itself plus its immediate left and right
    neighbours (keys prefixed with '-1:' and '+1:'). When a neighbour does
    not exist, the flag 'BOS' (no left neighbour) or 'EOS' (no right
    neighbour) is set to True instead.

    Args:
        sent: sequence of token strings.
        i: index of the token to describe.

    Returns:
        A dict mapping feature names to string or boolean values.
    """
    def describe(token, prefix=''):
        # The same three surface features are taken at every window
        # position; only the key prefix differs.
        return {
            prefix + 'word.lower()': token.lower(),
            prefix + 'word.istitle()': token.istitle(),
            prefix + 'word.isdigit()': token.isdigit(),
        }

    features = describe(sent[i])

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(describe(sent[i - 1], '-1:'))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        features.update(describe(sent[i + 1], '+1:'))

    return features

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order.

    Each dict is produced by ``word2features`` for the matching position.
    """
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the label sequence of a sentence.

    The input is already a list of tags, so it is handed back as-is
    (the same object, not a copy).
    """
    labels = sent
    return labels

# Replace each sentence by its per-token feature dicts; the training labels
# go through sent2labels
X_train = list(map(sent2features, X_train))
X_val = list(map(sent2features, X_val))
y_train = list(map(sent2labels, y_train))

# Flatten y_train and y_val: the classifier sees one sample per token
y_train_flat = [tag for sentence_tags in y_train for tag in sentence_tags]
y_val_flat = [tag for sentence_tags in y_val for tag in sentence_tags]

# Vectorize the flattened feature dicts; the vectorizer is fitted on the
# training tokens only and then reused for the validation tokens
vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(
    [token_features for sentence in X_train for token_features in sentence]
)
X_val_vec = vectorizer.transform(
    [token_features for sentence in X_val for token_features in sentence]
)

# Train the SVM model on the token-level samples
svm = LinearSVC().fit(X_train_vec, y_train_flat)

# Predict a label for every validation token
y_pred = svm.predict(X_val_vec)

# Per-class report, followed by the weighted-average F1 score
print(classification_report(y_val_flat, y_pred))

print("Overall F1 score:", f1_score(y_val_flat, y_pred, average='weighted'))

              precision    recall  f1-score   support

          DF       0.95      0.97      0.96       196
        NAME       0.91      0.97      0.94      3643
           O       0.94      0.85      0.90      3660
    QUANTITY       0.99      0.99      0.99      2152
        SIZE       0.97      0.96      0.97       103
       STATE       0.94      0.96      0.95      1092
        TEMP       0.80      0.86      0.83        43
        UNIT       0.94      0.98      0.96      1777

    accuracy                           0.94     12666
   macro avg       0.93      0.94      0.94     12666
weighted avg       0.94      0.94      0.94     12666

Overall F1 score: 0.9379162911332063
