In [163]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer
from scipy import sparse
import numpy as np

In [164]:
X_train = pd.read_csv('../datasets/X_train_w_SA.csv')
X_test = pd.read_csv('../datasets/X_test_w_SA.csv')
y_train = pd.read_csv('../datasets/y_train.csv')
y_test = pd.read_csv('../datasets/y_test.csv')

y_train['train_dataset'] = 1
y_test['train_dataset'] = 0

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

df = pd.concat([X, y], axis = 1)

In [165]:
sparse_vec = sparse.load_npz('../datasets/sparse_vec_df.npz')
vec_df = pd.DataFrame.sparse.from_spmatrix(sparse_vec)
vec_df.columns = pd.read_csv('../datasets/vec_df_cols.csv').loc[:,'0']

In [166]:
vec_df.head()

0,aal,aardvark,aba,aback,abacus,abandon,abandoned,abandoning,abandonment,abandons,...,zonation,zone,zoned,zones,zoning,zoo,zoom,zorro,zu,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39853,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39856,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [167]:
df['all_text'] = [np.nan]*df.shape[0]

for i in range(df.shape[0]):
    text = df.loc[i, 'text']
    title = df.loc[i, 'title']
    all_text = title + ' ' + text
    df.loc[i,'all_text'] = all_text

In [168]:
stemmer = PorterStemmer()

word_list = vec_df.columns

In [169]:
df['stemmed'] = [np.nan]*df.shape[0]

for i, t in enumerate(list(df['all_text'])):
    if t[0] == ' ':
        t = t[1:]
    stemmed = []
    for word in t.split(' '):
        if word.lower() in word_list:
            stem = stemmer.stem(word)
            stemmed.append(stem)
        else:
            pass
            
            
    df.loc[i,'stemmed'] = ' '.join(stemmed)
    if (i % 3000) == 0:
        print(i)

0
3000
6000
9000
12000
15000
18000
21000
24000
27000
30000
33000
36000
39000


In [170]:
cv = CountVectorizer(stop_words='english')
cv.fit(df['stemmed'])

words_cv = cv.transform(df['stemmed'])

stems_df = pd.DataFrame(words_cv.todense(), columns=cv.get_feature_names())

In [171]:
r_string = ''
f_string = ''

for i in range(df.shape[0]):
    string = df.loc[i,'stemmed']
    if df.loc[i,'is_true'] == 1:
        r_string += ' '+string
    else: 
        f_string += ' '+string

tvec = TfidfVectorizer(stop_words='english')

tvec.fit([r_string,f_string])

tv = pd.DataFrame(tvec.transform([r_string, f_string]).todense(),
                   columns=tvec.get_feature_names(),
                   index=['real', 'fake'])

In [172]:
stems_df.shape

(39858, 15566)

In [175]:
tv_t = tv.T

r_words = set(tv_t[tv_t['fake'] > 0.01].index)
f_words = set(tv_t[tv_t['fake'] > 0.01].index)
selected_words = list(r_words.union(f_words))
selected_words.sort()
len(selected_words)

577

In [178]:
full_df = df.merge(stems_df[selected_words], right_index = True, left_index = True)

In [180]:
full_df.shape

(39858, 633)

In [204]:
feats = list(full_df.columns[3:52])
feats.extend(selected_words)
len(feats)

626

In [205]:
full_df.loc[full_df['train_dataset'] == 1, feats].to_csv('../datasets/X_train_full.csv', index = False)
full_df.loc[full_df['train_dataset'] == 0, feats].to_csv('../datasets/X_test_full.csv', index = False)