In [2]:
import pandas as pd
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
columns = [  0,   1,   2,   5,   9,  10,  11,  15,  17,  18,  19,  30,  31,
        32,  35,  36,  38,  39,  40,  42,  45,  50,  51,  53,  57,  61,
        62,  68,  72,  73,  74,  76,  77,  80,  81,  82,  86,  87,  88,
        89,  90,  99, 100, 102, 105, 107, 108, 109, 110, 111, 114, 115,
       118, 119, 120, 121, 125, 127, 130, 131, 137, 150, 151, 156, 157,
       159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 171, 172, 176,
       177, 178, 179, 180, 181, 182, 183, 185, 190, 194, 195, 196, 197,
       200, 206, 207, 212, 214, 218, 219, 220, 224, 225, 230, 233, 236,
       237, 239, 240, 243]

In [7]:
vectorizer = CountVectorizer(
        max_df=0.8625878562494729, min_df=0.008728921136403306, dtype=np.float64)
X_train = vectorizer.fit_transform(pd.read_csv("sms_train.csv")["message"])
y_train = pd.read_csv("sms_train.csv")["label"]

In [10]:
model = LinearSVC(C=23.156370429386005, penalty='l1', dual=False, class_weight="balanced",
        max_iter=1_000_000)
X_selected = X_train[:, columns]
idx_to_word = {idx: w for w, idx in vectorizer.vocabulary_.items()}
columns_names = [idx_to_word[c] for c in columns]
columns_names

['150p',
 '16',
 '50',
 'again',
 'always',
 'am',
 'amp',
 'anything',
 'around',
 'as',
 'ask',
 'call',
 'can',
 'cant',
 'chat',
 'claim',
 'com',
 'come',
 'coming',
 'cos',
 'da',
 'doing',
 'don',
 'down',
 'feel',
 'free',
 'friends',
 'gonna',
 'gt',
 'guaranteed',
 'gud',
 'haha',
 'happy',
 'he',
 'help',
 'her',
 'him',
 'his',
 'home',
 'hope',
 'how',
 'know',
 'last',
 'later',
 'life',
 'll',
 'lol',
 'lor',
 'love',
 'lt',
 'may',
 'me',
 'min',
 'mins',
 'miss',
 'mobile',
 'msg',
 'my',
 'new',
 'next',
 'now',
 'over',
 'per',
 'pls',
 'prize',
 'really',
 'reply',
 'right',
 'said',
 'same',
 'say',
 'send',
 'sent',
 'service',
 'she',
 'sleep',
 'sms',
 'something',
 'soon',
 'sorry',
 'special',
 'still',
 'stop',
 'sure',
 'take',
 'text',
 'then',
 'things',
 'think',
 'this',
 'thk',
 'today',
 'txt',
 'uk',
 'use',
 'very',
 'was',
 'wat',
 'way',
 'were',
 'what',
 'why',
 'with',
 'would',
 'www',
 'yeah',
 'year',
 'your']

In [11]:
model.fit(X_selected, y_train)

In [69]:
coefs = model.coef_.reshape(-1)
column_weight = {col: c for (col, c) in zip(columns_names, list(coefs))}
column_weight = pd.DataFrame(column_weight, index=[0]).transpose()
column_weight.columns = ["coef"]
positive = column_weight[column_weight.coef > 0]
negative = column_weight[column_weight.coef < 0]

In [70]:
def save_word_list(df, fname):
    df.loc[:,'weight'] = np.abs(np.round(df.coef * 100).astype(int))
    df = df.reset_index()
    df.columns = ["word", "coef", "weight"]
    df.loc[:,'color'] = ""
    df.loc[:,'url'] = ""
    df.drop(["coef"], axis=1).loc[:, ["weight", "word", "color", "url"]].to_csv(fname, index=False)

In [71]:
save_word_list(positive, "positive_words.csv")
save_word_list(negative, "negative_words.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'weight'] = np.abs(np.round(df.coef * 100).astype(int))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'weight'] = np.abs(np.round(df.coef * 100).astype(int))


In [72]:
def save_word_list_color(df, fname):
    df.loc[:,'weight'] = np.abs(np.round(df.coef * 100).astype(int))
    df = df.reset_index()
    df.columns = ["word", "coef", "weight"]
    df.loc[:,'color'] = df.coef.apply(lambda c: "#0ad11e" if c < 0 else "#e30b0b")
    df.loc[:,'url'] = ""
    df.drop(["coef"], axis=1).loc[:, ["weight", "word", "color", "url"]].to_csv(fname, index=False)

In [73]:
save_word_list_color(column_weight, "colored_words.csv")