This is Charles' first try at the modelling of the reddit data

### Importing

In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from textblob import TextBlob

In [92]:
csv = pd.read_csv("../data/combined_cleaned_vectorized.csv")

#make new df with only the text and the label columns
df = csv[['text', 'label']].copy()
y = df['label']

### Adding features to see the lexical diversity and sentiment of each post

In [93]:
# Adding a feature to see lexical diversity
df.loc[:, 'lexical_diversity'] = df['text'].apply(lambda x: len(set(x.split())) / len(x.split()) if len(x.split()) > 0 else 0)

# Adding a feature to see sentiment
df.loc[:, 'sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)



AttributeError: 'int' object has no attribute 'split'

### Topic Modelling Should also be added as a feature

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel

# Create a dictionary from the data
dictionary = corpora.Dictionary(df['text'].apply(lambda x: x.split()))

# Create a bag of words corpus by passing the tokenized list of words to the dictionary
corpus = [dictionary.doc2bow(text) for text in df['text'].apply(lambda x: x.split())]

# Initialise the LDA model and fit
num_topics = 10  # change this based on your understanding of the data
lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# get the topic distribution of each document and take the most probable topic as label
df.loc[:, 'topic'] = df['text'].apply(lambda x: sorted(lda[dictionary.doc2bow(x.split())], key=lambda item: -item[1])[0][0])



In [None]:
df['topic'] = df['topic'].astype('category')
topic_dummies = pd.get_dummies(df['topic'], prefix='topic')


In [None]:
df = pd.concat([df, topic_dummies], axis=1)


### Finally, use word vectorization to improve the model some more

In [None]:
# To include this feature in your model, you'll need to adjust your train/test split:
X = df[['text', 'lexical_diversity', 'sentiment', 'topic']]
y = df['label']


In [None]:
print(df.head())

### Train and Test Splitting

In [None]:
# Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



### Excluding Words from Dataset and Converting text into token counts

In [None]:
# Define custom stop words
custom_stop_words = []

In [None]:
# Convert text into matrix of token counts and then transform a count matrix to a normalized tf-idf representation
pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), stop_words=custom_stop_words)),
    ('tfidf', TfidfTransformer()),
])

X_train_transformed = pipeline.fit_transform(X_train['text'])
X_test_transformed = pipeline.transform(X_test['text'])


In [None]:
X_train_df = pd.DataFrame(X_train_transformed.toarray(),
    columns=pipeline['vect'].get_feature_names_out())
X_train_df

In [None]:

# Define your text processing pipeline
text_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1), stop_words=custom_stop_words)),
    ('tfidf', TfidfTransformer()),
])



In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_pipeline, 'text'),
        ('num', StandardScaler(), ['lexical_diversity','sentiment']),
        ('topic', 'passthrough', topic_dummies.columns),
    ])


### Modelling

In [None]:
# Initialize models
clf1 = RandomForestClassifier(n_estimators=1000, random_state=42)
clf2 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=100)
clf3 = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=3, random_state=42)

# Create ensemble model
eclf = VotingClassifier(estimators=[('rf', clf1), ('adb', clf2), ('gb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Random Forest', 'AdaBoost', 'GradientBoost', 'Ensemble']):
    clf.fit(X_train_transformed, y_train)
    print("Accuracy: %0.2f [%s]" % (clf.score(X_test_transformed, y_test), label))
