In [1]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Load the datasets
deniers = pd.read_csv('/home/users/mhossain/projects/cards/data/deniers.csv')
believers = pd.read_csv('/home/users/mhossain/projects/cards/data/deniers.csv')

In [3]:
deniers

Unnamed: 0,body
0,Although the film crew swore up and down they ...
1,That is entirely what the other papers say. I...
2,Climate change will always exist and always ha...
3,The biggest problem is that the main water rig...
4,Are you suggesting some conspiracy by NOAA to ...
...,...
46519,&gt;the discovery that warmer air does not hol...
46520,Perhaps it is better explained here where it i...
46521,You should be bothered. This blog post details...
46522,&gt;The whole argument that anthropological CO...


In [4]:
# Combine the datasets
deniers['label'] = 0  # Label for deniers
believers['label'] = 1  # Label for believers
combined = pd.concat([deniers, believers], ignore_index=True)

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Assuming 'sample' is your pandas Series containing text data

# Preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = ' '.join([lemmatizer.lemmatize(word) for word in words])
    return words


In [8]:
# Preprocess text data
deniers['processed_str'] = deniers['body'].apply(preprocess_text)
believers['processed_str'] = believers['body'].apply(preprocess_text)


In [9]:
deniers['processed_str'] 

0        although film crew swore didnt anything upset ...
1        entirely paper say long term carbonite chemist...
2        climate change always exist always change extr...
3        biggest problem main water right colorado rive...
4        suggesting conspiracy noaa artificially amplif...
                               ...                        
46519    gtthe discovery warmer air hold water vapor on...
46520    perhaps better explained called manufacted dou...
46521    bothered blog post detail broadly httplippardb...
46522    gtthe whole argument anthropological co emissi...
46523    well must say seems like scathing attack anyon...
Name: processed_str, Length: 46524, dtype: object

In [10]:
# Combine datasets
# Apply preprocessing
combined['processed'] = combined['body'].apply(preprocess_text)

In [11]:
combined.head()

Unnamed: 0,body,label,processed
0,Although the film crew swore up and down they ...,0,although film crew swore didnt anything upset ...
1,That is entirely what the other papers say. I...,0,entirely paper say long term carbonite chemist...
2,Climate change will always exist and always ha...,0,climate change always exist always change extr...
3,The biggest problem is that the main water rig...,0,biggest problem main water right colorado rive...
4,Are you suggesting some conspiracy by NOAA to ...,0,suggesting conspiracy noaa artificially amplif...


In [12]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(combined['processed'], combined['label'], test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [13]:
combined['processed'][3]

'biggest problem main water right colorado river divided period heavy rain water allocated typically exists even without considering climate change major problem need solved time true southwest midst year drought le rain people need water river hotter people plant animal need water water le plentiful going make even harder fix original problem allocation bad forestry practice compounded longer severe fire season hard controlled burn weather doesnt allow effectively control blaze wouldnt care climate change didnt threaten way life pointing human behavior partly responsible risk isnt particularly useful insight'

In [14]:
X_test_vec

<18610x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 656196 stored elements in Compressed Sparse Row format>

In [15]:
# Train a classifier (Logistic Regression in this case)
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train_vec, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test_vec)

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.43      0.41      0.42      9340
           1       0.43      0.44      0.43      9270

    accuracy                           0.43     18610
   macro avg       0.43      0.43      0.43     18610
weighted avg       0.43      0.43      0.43     18610



In [10]:
# Create synthetic labels based on LDA topics for demonstration purposes
# Normally, you would have actual labels for supervised learning
data['label'] = [0] * len(deniers) + [1] * len(believers)

In [11]:
data['label']

0        0
1        0
2        0
3        0
4        0
        ..
93043    1
93044    1
93045    1
93046    1
93047    1
Name: label, Length: 93048, dtype: int64

In [12]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['processed_str'], data['label'], test_size=0.2, random_state=42)

In [13]:
X_train

4938     apart ideal aggressive way shes debating theme...
92950    author summary judith curry article gt changin...
9338     gt climate science sun ignored claim responsib...
26122    ill take time explain point hope clarifies con...
33970    im pretending famine im saying million people ...
                               ...                        
6265     guy phd industrial engineering stem subject me...
54886    gtit strike dont understand difficulty predict...
76820    none thing beginning dont count forcings haven...
860      dont worry trying take away air conditioning c...
15795    heh ill give go would identify politically im ...
Name: processed_str, Length: 74438, dtype: object

In [14]:
X_test

17832    gti really care phil jones think others disput...
68857    nothing disprove climate change climate change...
61833    propaganda overlaid climate change newspeak us...
31486    gtwell even author concluded unlikely cause re...
22610    gtnot wrong condemning u kid world afflicted c...
                               ...                        
50579    globe warming whats climate change apart tax scam
31251    australian government tried rescoping climate ...
27743    gtthe u department justice doj last thursday f...
43165    well gosh thats true isnt since warm record br...
63426    yes end around anthropogenic climate change ag...
Name: processed_str, Length: 18610, dtype: object

In [15]:
# Create a pipeline with TF-IDF Vectorizer and Multinomial Naive Bayes Classifier
model = make_pipeline(TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english'), MultinomialNB())

In [16]:
# Train the classifier
model.fit(X_train, y_train)

In [17]:
# Predict classes for the test set
y_pred = model.predict(X_test)

In [18]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Classifier Accuracy: {accuracy:.2f}")

Classifier Accuracy: 0.21


In [19]:
# Add predictions to the original data
data['predicted_label'] = model.predict(data['processed_str'])

In [20]:
data['predicted_label']

0        1
1        0
2        0
3        1
4        0
        ..
93043    1
93044    0
93045    1
93046    1
93047    0
Name: predicted_label, Length: 93048, dtype: int64

In [21]:
# Save results to CSV
output = data[['body', 'predicted_label']]
output.to_csv('reddit_classification_results.csv', index=False)

print("Results saved to 'reddit_classification_results.csv'")

Results saved to 'reddit_classification_results.csv'
