In [1]:
import pandas as pd
import numpy as np
import statistics

In [2]:
# Read the two CSV inputs used below: `df` supplies the training rows and
# `df_no_freq` supplies the held-out evaluation rows.
df, df_no_freq = (
    pd.read_csv(csv_path)
    for csv_path in ('data_10_freq.csv', 'data_no_freq.csv')
)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single TF-IDF vectorizer with default settings; it is fitted on the training
# text and then reused (without refitting) to transform the test text.
vectorize = TfidfVectorizer()

In [4]:
# Fixed list of 313 values matched against the 'id' column to define the
# hold-out split: rows with these ids are excluded from training and are the
# only rows used for evaluation. Hard-coded so the split is identical on
# every run.
id_test = [ 427, 1676, 1383, 2392, 2071,  362, 1006, 1679,  544, 1859, 1725,
       2231, 2728,  332,  696, 2552, 1910,  824, 1423,  413,  975, 2096,
        549, 1233,  841, 2859, 2453, 2874,  550, 1896,  874, 1740, 2280,
        775, 1587,  793,  143, 2876, 1261, 2695, 1805, 1461, 2944, 1164,
        984,  849, 1634, 2708, 1878,  395,  510,  445, 2655,  965, 2946,
        832, 2219, 2198, 1140, 1486, 2909,  443,  620,  302, 2477, 1496,
       2566, 1621,  561, 1593, 1635, 2308, 2706, 1094,  199, 2916, 2524,
       2438, 1566,  599, 2353,  967,  796, 1835, 1337, 1963, 1817, 2836,
         32, 2482,   67, 2217, 1545, 1287, 1647, 2831, 1842, 2042,  304,
       2434,  980,  872, 1075,  326, 1494, 2298, 2568, 2619, 2448, 1471,
        316, 2683,  513, 2051, 1179, 2960, 2032, 2764,  201, 1352, 2366,
        897, 1907,  663,   30,  452, 1073, 2809,  585, 1660, 1331, 1639,
        182,  894, 1611, 1820, 2500, 2969, 2534, 1881, 2522, 2344,   73,
       1257,   63, 1701,  216,   45, 1452, 2301, 1230, 1518,   80, 2601,
       2844, 1153, 1021,  494, 2557, 1474, 1442,  420, 1681, 2651, 1533,
        713, 2799, 1557, 2243, 2362, 2903,  617, 1711, 1252, 1370,  628,
         44, 1469,  473, 1546, 2758, 2271,  446, 2875, 2865,  339,  423,
         93, 1536,  179,  801,  449, 1543, 1517,  470, 1421,  581,  862,
        534,  590, 2476, 2413,  522,  376, 2447, 2518,  244, 2412, 2886,
        463, 1892, 1777, 2107, 1816, 2416, 2327, 1246, 1234, 2797,  476,
        612,  560,  422,   70,  915, 2689,  501, 1145,   56, 1247, 1673,
       2098, 2869, 1491,  329,  102,  404, 2466, 1108,  500, 2664, 1814,
       2127, 1363, 2631, 2321, 2307,  126,  432, 2041,  545,  616, 1083,
       1935,  681, 2106,  750, 2160, 1666, 2444, 1137, 1736, 2281, 1703,
       1417, 2850,  238, 2312, 1825, 2840, 1136, 1318,  525, 1932,  904,
       1705, 2275, 1131, 2471, 1204, 2295, 2025, 2861, 1299, 1147, 2819,
       2250, 1291,   29, 2621,  880, 1268, 1090, 1631, 1784,  193,  173,
       1824,   43,   87, 2672,  865,  288, 2816, 2796,  661, 1005, 1283,
       2635, 2446,  151,  538, 1525]

In [5]:
# Hold-out split by row id: every row of `df` whose id is NOT in `id_test`
# is used for training, while the evaluation rows are the rows of
# `df_no_freq` whose id IS in `id_test`.
train_mask = ~df['id'].isin(id_test)
test_mask = df_no_freq['id'].isin(id_test)

df_train = df.loc[train_mask]
df_test = df_no_freq.loc[test_mask]

In [6]:
# Learn the vocabulary and IDF weights from the training text only, and
# vectorise that same text in one pass.
train_text = df_train['text']
tfidf_train = vectorize.fit_transform(train_text)

# Project the held-out text onto the already-fitted vocabulary (no refit).
test_text = df_test['text']
tfidf_test = vectorize.transform(test_text)

In [7]:
# Feature matrices are simply the TF-IDF matrices computed above.
x_train, x_test = tfidf_train, tfidf_test

# Label frames hold the eight tag columns, which are named '1' through '8'.
label_columns = [str(tag) for tag in range(1, 9)]
y_train = df_train[label_columns]
y_test = df_test[label_columns]

In [8]:
from sklearn.svm import SVC

# Per-tag ratio used to derive the class weights below. It is treated as the
# share of positive examples for tag columns '1'..'8' (hard-coded; presumably
# measured on the dataset -- TODO confirm where these numbers come from).
ratio = {1: 0.51, 2: 0.34, 3: 0.34, 4: 0.26, 5: 0.11, 6: 0.2, 7: 0.05, 8: 0.06}

# With probability=True, SVC calibrates probabilities using an internal
# cross-validation that shuffles the data, so predict_proba differs between
# runs unless the estimator is seeded. Hard predictions are not affected.
SVM_RANDOM_STATE = 42

# One independent binary classifier per tag, keyed by the tag number 1..8.
svm_models = {}

for i in range(1, 9):
    # Binary target for the current tag.
    y_train_class = df_train[str(i)]

    # Inverse-frequency weighting: negatives get 1 / (2 * (1 - r)) and
    # positives get 1 / (2 * r). If r is the true positive fraction this is
    # the same formula as scikit-learn's 'balanced' heuristic.
    positive_ratio = ratio[i]
    class_weight = {
        False: 1 / (2 * (1 - positive_ratio)),
        True: 1 / (2 * positive_ratio),
    }

    svc = SVC(
        kernel='rbf',
        C=10,
        gamma=0.1,
        probability=True,
        class_weight=class_weight,
        random_state=SVM_RANDOM_STATE,
    )
    svc.fit(x_train, y_train_class)

    # Keep the fitted model so it can be evaluated and pickled later.
    svm_models[i] = svc

# Column 1 of predict_proba belongs to the second entry of `classes_`
# (the positive class, assuming the labels are 0/1 or False/True).
pred_prob = {i: model.predict_proba(x_test)[:, 1] for i, model in svm_models.items()}

# Hard 0/1 decisions. NOTE: scikit-learn documents that predict() and
# predict_proba() can disagree for SVC, so thresholding `pred_prob` at 0.5
# is not guaranteed to reproduce `pred`.
pred = {i: model.predict(x_test) for i, model in svm_models.items()}

In [9]:
# Start the results table from an independent copy of the test ids and the
# eight ground-truth tag columns ('1'..'8').
result_columns = ['id'] + [str(tag) for tag in range(1, 9)]
df_pred = df_test[result_columns].copy()

In [10]:
# Append one '<tag>_pred' column per tag holding the values from `pred_prob`.
for tag in range(1, 9):
    df_pred[f'{tag}_pred'] = pred_prob[tag]

In [11]:
# Preview the probability column stored for tag 1 as a quick sanity check.
df_pred['1_pred']

29      0.522190
30      0.567542
32      0.232239
43      0.592884
44      0.393615
          ...   
2832    0.219421
2860    0.507652
2862    0.345988
2875    0.415167
2884    0.233965
Name: 1_pred, Length: 313, dtype: float64

In [12]:
# Persist the ids, ground-truth tags and '<tag>_pred' columns. The DataFrame
# index is written as well because `index` is left at its default.
df_pred.to_csv('result_10.csv')

In [13]:
from sklearn.metrics import f1_score

# F1 of the hard predictions for each tag, in tag order 1..8.
f1_scores = [
    f1_score(df_test[str(tag)], pred[tag])
    for tag in range(1, 9)
]

# Report each per-tag score, then the unweighted mean over the eight tags.
for tag, score in enumerate(f1_scores, start=1):
    print(f'F1 score for column {tag}: {score:.4f}')

print(f'average: {statistics.mean(f1_scores)}')

F1 score for column 1: 0.6624
F1 score for column 2: 0.5701
F1 score for column 3: 0.4474
F1 score for column 4: 0.6329
F1 score for column 5: 0.1667
F1 score for column 6: 0.5000
F1 score for column 7: 0.4444
F1 score for column 8: 0.0909
average: 0.4393570180312296


In [14]:
from sklearn.metrics import accuracy_score

# Calculate accuracy for each output separately. The results are kept in
# their own list so the F1 results in `f1_scores` are not overwritten.
accuracy_scores = []
for i, column in enumerate(['1', '2', '3', '4', '5', '6', '7', '8']):
    # `pred` is keyed by the 1-based tag number, hence i+1.
    accuracy = accuracy_score(df_test[column], pred[i+1])
    accuracy_scores.append(accuracy)

# Print accuracy for each output
for i, accuracy in enumerate(accuracy_scores):
    print(f'accuracy for column {i+1}: {accuracy:.4f}')

# Unweighted mean over the eight tags.
print(f'average: {statistics.mean(accuracy_scores)}')

accuracy for column 1: 0.6613
accuracy for column 2: 0.6965
accuracy for column 3: 0.5974
accuracy for column 4: 0.8147
accuracy for column 5: 0.8403
accuracy for column 6: 0.7700
accuracy for column 7: 0.9521
accuracy for column 8: 0.9361
average: 0.7835463258785943


In [15]:
import os
import pickle

# open() does not create missing directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('models', exist_ok=True)

# Write one pickle per tag: models/clf10_for_tag_<tag>.pkl
for i, model in svm_models.items():
    with open(f'models/clf10_for_tag_{i}.pkl', 'wb') as fid:
        pickle.dump(model, fid)

# NOTE(review): only the SVMs are saved in this cell. The fitted TF-IDF
# vectorizer (`vectorize`) is needed to turn new text into features for
# these models -- confirm it is persisted somewhere as well.

# To load a model back (only unpickle files from a trusted source):
# with open('models/clf10_for_tag_1.pkl', 'rb') as fid:
#     clf_loaded = pickle.load(fid)