In [1]:
#To scrape movie reviews from IMDb using BeautifulSoup and requests, you can use the following code:

# Import libraries
import requests
from bs4 import BeautifulSoup

# Scrape text snippets from IMDb's title-search listing, one results page at a time.
# `base_url` ends with "start=", so each page URL is formed by appending the
# 1-based index of the first result on that page.
base_url = "https://www.imdb.com/search/title/?title_type=feature&user_rating=1.0,10.0&num_votes=10000,&genres=action&sort=user_rating,desc&start="
num_pages = 10
results_per_page = 50  # step between consecutive `start` values (1, 51, 101, ...)
request_timeout = 10   # seconds; without a timeout a stalled connection hangs the cell forever

# Collected text, one entry per listing item that has a matching <p>
reviews = []

for i in range(num_pages):
    # Construct the URL for each page
    url = base_url + str(i * results_per_page + 1)
    # Fetch the page and fail loudly on HTTP errors instead of silently parsing
    # an error page (which would just yield zero items)
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Each listing item is expected inside a div with class "lister-item-content"
    divs = soup.find_all("div", class_="lister-item-content")
    for div in divs:
        # NOTE(review): find() returns only the FIRST <p class="text-muted"> in the
        # item - confirm that this element actually holds the text you want.
        p = div.find("p", class_="text-muted")
        if p is None:
            # Item without the expected paragraph: skip it rather than crash on p.text
            continue
        reviews.append(p.text.strip())


In [2]:
#To perform text preprocessing, tokenization, lemmatization, and vectorization using NLTK and Scikit-learn, you can use the following code:

# Import libraries
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a function to preprocess the text
def preprocess(text):
    """Normalize one document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, replace every character outside ``a-z`` with a
    space, tokenize with ``nltk.word_tokenize``, drop English stop words, and
    lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # Convert the text to lowercase so the a-z filter below keeps all letters
    text = text.lower()
    # Remove punctuation, digits and any other non-letter character
    text = re.sub("[^a-z]", " ", text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Use a set so each membership test is O(1) instead of scanning a list
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Return the tokens as a single string (the form TfidfVectorizer consumes)
    return " ".join(tokens)

# Run every collected review through preprocess(); the cleaned strings replace
# the raw ones under the same name `reviews`
reviews = list(map(preprocess, reviews))

# Learn the TF-IDF vocabulary from the cleaned reviews and build the
# document-term matrix X in one pass
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(
    reviews
)


ModuleNotFoundError: No module named 'nltk'

In [None]:
#To train and evaluate various machine learning models using Scikit-learn and TensorFlow, you can use the following code:

# Import libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hold out 20% of the samples for testing (fixed seed so the split is repeatable).
# NOTE(review): `movie_reviews` is not defined in the cells visible here - the labels
# must be supplied by the user and line up row-for-row with X; confirm.
y = movie_reviews.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define a function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    return score

# Train the three classical baselines on the same split via train_and_evaluate()
# and report each one's test accuracy
splits = (X_train, X_test, y_train, y_test)

# Logistic regression
log_reg = LogisticRegression()
log_reg_acc = train_and_evaluate(log_reg, *splits)
print(f"Logistic regression accuracy: {log_reg_acc}")

# Multinomial naive Bayes
nb = MultinomialNB()
nb_acc = train_and_evaluate(nb, *splits)
print(f"Naive Bayes accuracy: {nb_acc}")

# Linear support vector machine
svm = LinearSVC()
svm_acc = train_and_evaluate(svm, *splits)
print(f"SVM accuracy: {svm_acc}")

# Create and evaluate a neural network model
# Keras layers need dense arrays. Guard the conversion so that re-running this cell,
# when X_train / X_test are already ndarrays, does not fail on a missing .toarray().
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
if hasattr(X_test, "toarray"):
    X_test = X_test.toarray()

# Each TF-IDF row is a real-valued feature vector, not a sequence of integer token
# ids, so it is fed to a feed-forward network. An Embedding layer expects integer
# indices and is therefore not applicable to this representation.
nn = Sequential()
nn.add(Dense(64, activation="relu"))
nn.add(Dropout(0.2))
# Single sigmoid unit: one probability per sample for the binary label
nn.add(Dense(1, activation="sigmoid"))
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# Train the model (the test split doubles as the per-epoch validation set)
nn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
# evaluate() returns [loss, accuracy] because of metrics=["accuracy"] above
nn_acc = nn.evaluate(X_test, y_test)[1]
print(f"Neural network accuracy: {nn_acc}")


In [None]:
#To visualize the results and insights using Matplotlib, Seaborn, and WordCloud, you can use the following code:

# Import libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Compare the test accuracy of the four models in a single bar chart
accuracies = [log_reg_acc, nb_acc, svm_acc, nn_acc]
models = ["Logistic Regression", "Naive Bayes", "SVM", "Neural Network"]
ax = sns.barplot(x=models, y=accuracies)
ax.set_xlabel("Model")
ax.set_ylabel("Accuracy")
ax.set_title("Model Comparison")
plt.show()

# Plot the confusion matrix of the best model (assuming it is the neural network)
from sklearn.metrics import confusion_matrix  # not imported anywhere else in the notebook

# `Sequential.predict_classes` is not available in current TensorFlow releases.
# Threshold the sigmoid output at 0.5 instead, then flatten (n, 1) -> (n,).
y_pred = (nn.predict(X_test) > 0.5).astype("int32").ravel()
cm = confusion_matrix(y_test, y_pred)
# fmt="d" renders the cell counts as integers
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# Build a word cloud from the reviews whose label equals 1 (treated as positive here)
positive_reviews = [
    text
    for text, sentiment in zip(reviews, y)
    if sentiment == 1
]
positive_text = " ".join(positive_reviews)
wordcloud = WordCloud(background_color="white").generate(positive_text)

# Show the rendered cloud as an image, without axis ticks
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Positive Reviews")
plt.axis("off")
plt.show()
