In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd

In [None]:
# Toy training corpus: each (review text, sentiment label) pair is written
# together, then split into the parallel `docs` / `labels` lists.
docs, labels = (
    list(column)
    for column in zip(
        ("I absolutely loved this movie. It was an amazing movie and wonderful!", "positive"),
        ("I hated this movie. It was terrible and awful.", "negative"),
    )
)

In [None]:
# Show the sentiment labels; they are positionally aligned with `docs`.
labels

In [None]:
# Side-by-side table of each training review and its sentiment label.
df_reviews = pd.DataFrame(dict(review=docs, label=labels))

In [None]:
# Show the training reviews alongside their labels.
df_reviews

In [None]:
# 1. Vectorize
# Learn the vocabulary from the training reviews, then encode those same
# reviews as a sparse matrix of word counts (one row per review).
vec = CountVectorizer().fit(docs)
X = vec.transform(docs)
# Vocabulary terms, in the same order as the columns of X.
feature_names = vec.get_feature_names_out()

In [None]:
# Vocabulary learned by the vectorizer from the training reviews.
feature_names

In [None]:
# Dense copy of the count matrix so each review's row can be pulled out by position.
X_dense = X.toarray()
# Row order follows `docs`: row 0 is the positive review, row 1 the negative one.
count_positive, count_negative = X_dense[0], X_dense[1]

In [None]:
# Dense count matrix: one row per training review.
X_dense

In [None]:
# 2. Train a tiny random forest
# A fixed random_state makes the forest's randomness (and therefore the feature
# importances and predicted probabilities shown below) identical on every
# Restart & Run All; without it each run can produce different numbers.
clf = RandomForestClassifier(random_state=42)
clf.fit(X, labels)

# Alternative: a Gaussian naive Bayes classifier. It is fitted on the dense
# array rather than the sparse matrix, and it has no `feature_importances_`,
# so the feature-importance cells below only work with the random forest.
#clf = GaussianNB()
#clf.fit(X_dense, labels)

In [None]:
# Importance scores from the fitted classifier (paired with feature_names below).
clf.feature_importances_

In [None]:
# 3. Inspect feature importances

# One row per vocabulary word: its count in each training review next to the
# importance the fitted classifier assigned to that word's column.
df = pd.DataFrame(dict(
    word=feature_names,
    count_positive=count_positive,
    count_negative=count_negative,
    feature_importance=clf.feature_importances_,
))

# Most important words first.
df_sorted = df.sort_values('feature_importance', ascending=False)

In [None]:
# Word counts and importances, sorted with the most important words first.
df_sorted

In [None]:
# Exercise: try changing the movie reviews above.
# Add some words, making sure that some words show up in both reviews but with a higher count in one than the other.

In [None]:
# 1. New, unseen reviews to classify
new_docs = [
    # expected: positive
    "I really enjoyed this film, it was amazing!",
    # expected: negative
    "This movie was awful and boring.",
    # neutral wording
    "The movie was okay, not great but not terrible.",
]

In [None]:
# 2. Vectorize the new data using the *fitted* CountVectorizer (vec)
#    This converts the text into the numerical features the model expects.
#    Only `transform` is called here (no re-fitting), so X_new is encoded with
#    the vocabulary that `vec` learned from the training reviews.
X_new = vec.transform(new_docs)


In [None]:
# Dense view of the new count matrix for inspection. `toarray()` is used so the
# result is a plain ndarray, consistent with how X_dense is built from X.
X_new.toarray()

In [None]:
# 3. Score the new reviews with the trained classifier (clf)

# Per-class probabilities for each new review.
pred_probs = clf.predict_proba(X_new)
# Hard label chosen for each new review.
pred_labels = clf.predict(X_new)

# note that for some classifiers you need to use .toarray()
#pred_labels = clf.predict(X_new.toarray())
#pred_probs = clf.predict_proba(X_new.toarray())

In [None]:
# Predicted label for each entry of new_docs.
pred_labels

In [None]:
# Predicted class probabilities for each entry of new_docs.
pred_probs

In [None]:

# Summarise the predictions: one row per new review with its predicted label
# and the probability assigned to each sentiment. The probability columns are
# looked up by class name, so the table does not depend on the order of
# clf.classes_.
df_new = pd.DataFrame(dict(
    review=new_docs,
    predicted_label=pred_labels,
    prob_positive=pred_probs[:, clf.classes_.tolist().index('positive')],
    prob_negative=pred_probs[:, clf.classes_.tolist().index('negative')],
))

In [None]:
# Show the prediction table for the new reviews.
df_new