In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Fetch the NLTK corpora this notebook relies on: the English stop-word list
# and WordNet (used by WordNetLemmatizer). Without them a fresh environment
# fails with LookupError in the cleaning cells. nltk.download() reports an
# already-installed package as up to date, so re-running this cell is cheap.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/smoo/nltk_data...


In [33]:
# Location of the merged reviews CSV on GitHub
data_path = 'https://raw.githubusercontent.com/eboyer221/CS39AA-Project/main/merged_data.csv'

# A row is kept only when both of these fields are present
columns_to_check = ['student_star', 'comments']

# Read the file, discard incomplete rows, then renumber the index from 0
df_1 = (
    pd.read_csv(data_path)
    .dropna(subset=columns_to_check)
    .reset_index(drop=True)
)

# Fields that are not needed for the text model
columns_to_remove = [
    'school_name', 'local_name', 'state_name',
    'year_since_first_review', 'take_again', 'diff_index',
    'tag_professor', 'post_date', 'name_onlines', 'attence',
    'for_credits', 'would_take_agains', 'grades', 'stu_tags',
    'help_useful', 'help_not_useful',
]

# df is the trimmed working frame; df_1 keeps every original column
df = df_1.drop(columns=columns_to_remove)

# Widen pandas' column display so more of each comment is visible
pd.set_option("display.max_colwidth", 370)

df.head()

Unnamed: 0,professor_name,department_name,star_rating,num_student,student_star,student_difficult,comments
0,Robert Olshansky,Urban & Regional Planning department,3.5,1,3.5,2.0,"Good guy, laid back and interested in his field. Class can get... a little..... slllllllloooooowwwwwwww during his junior workshop."
1,Marshall Levett,Counseling department,5.0,2,5.0,1.0,such a fun professor. really helpful and knows his stuff
2,Marshall Levett,Counseling department,5.0,2,5.0,1.0,Such a easy class. It\'s simple. Do your homework and pay attention and you will fly right by or be the person that blames him for not leaarning. He wont let you fail. just ask for help....
3,Soazig Le Bihan,Philosophy department,3.6,4,5.0,5.0,"A very hard class, and a massive amount of work. But, Soazig is also very good about explaining difficult concepts, gives excellent feedback, and is very accessible for extra assistance."
4,Soazig Le Bihan,Philosophy department,3.6,4,1.0,4.0,"Took 100 level class for Ethics offered online as an option to fill a core requirement She was terrible! Did not seem to have a grasp of the English language nor does she seem to have a grasp on reality as she insisted many times that failure in an ENTRY LEVEL, OPTIONAL class is very common due to the ""difficulty"" of material, very full of herself"


In [34]:
# Text cleaning with stemming: a cached-resource helper plus the cleaner itself
def _stemm_resources():
    """Return the ``(stop_words, stemmer)`` pair, building it on first use only.

    The English stop-word set and the ``PorterStemmer`` instance do not depend
    on the comment being cleaned, so they are created once and stored on this
    function object instead of being rebuilt for every row.
    """
    cached = getattr(_stemm_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), PorterStemmer())
        _stemm_resources._cached = cached
    return cached


def clean_comments_stemm(text):
    """Normalise a comment and reduce every remaining word to its Porter stem.

    Parameters
    ----------
    text : object
        Raw comment. Anything that is not a ``str`` (for example a float NaN),
        and any string equal to ``'nan'`` ignoring case, is returned unchanged.

    Returns
    -------
    object
        For ordinary strings: the lowercased text with every character other
        than ASCII letters and whitespace deleted, English stop words dropped,
        each word stemmed, and the words joined by single spaces. Otherwise
        the input itself.
    """
    # Pass non-text values and the literal 'nan' through untouched
    if not isinstance(text, str) or text.lower() == 'nan':
        return text

    stop_words, stemmer = _stemm_resources()

    # Characters are deleted rather than replaced by a space, so "it's" -> "its"
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # split() with no argument is what collapses runs of whitespace
    kept_words = [word for word in letters_only.split() if word not in stop_words]

    return ' '.join(stemmer.stem(word) for word in kept_words)

# Run clean_comments_stemm over every review and store the result beside the raw text
stemmed_comments = df['comments'].apply(clean_comments_stemm)
df['tokens_stemm'] = stemmed_comments

df.head()

Unnamed: 0,professor_name,department_name,star_rating,num_student,student_star,student_difficult,comments,tokens_stemm
0,Robert Olshansky,Urban & Regional Planning department,3.5,1,3.5,2.0,"Good guy, laid back and interested in his field. Class can get... a little..... slllllllloooooowwwwwwww during his junior workshop.",good guy laid back interest field class get littl slllllllloooooowwwwwwww junior workshop
1,Marshall Levett,Counseling department,5.0,2,5.0,1.0,such a fun professor. really helpful and knows his stuff,fun professor realli help know stuff
2,Marshall Levett,Counseling department,5.0,2,5.0,1.0,Such a easy class. It\'s simple. Do your homework and pay attention and you will fly right by or be the person that blames him for not leaarning. He wont let you fail. just ask for help....,easi class simpl homework pay attent fli right person blame leaarn wont let fail ask help
3,Soazig Le Bihan,Philosophy department,3.6,4,5.0,5.0,"A very hard class, and a massive amount of work. But, Soazig is also very good about explaining difficult concepts, gives excellent feedback, and is very accessible for extra assistance.",hard class massiv amount work soazig also good explain difficult concept give excel feedback access extra assist
4,Soazig Le Bihan,Philosophy department,3.6,4,1.0,4.0,"Took 100 level class for Ethics offered online as an option to fill a core requirement She was terrible! Did not seem to have a grasp of the English language nor does she seem to have a grasp on reality as she insisted many times that failure in an ENTRY LEVEL, OPTIONAL class is very common due to the ""difficulty"" of material, very full of herself",took level class ethic offer onlin option fill core requir terribl seem grasp english languag seem grasp realiti insist mani time failur entri level option class common due difficulti materi full


In [35]:
# Text cleaning with lemmatization: a cached-resource helper plus the cleaner itself
def _lemm_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use only.

    The English stop-word set and the ``WordNetLemmatizer`` instance do not
    depend on the comment being cleaned, so they are created once and stored
    on this function object instead of being rebuilt for every row.
    """
    cached = getattr(_lemm_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _lemm_resources._cached = cached
    return cached


def clean_comments_lemm(text):
    """Normalise a comment and lemmatize every remaining word.

    Parameters
    ----------
    text : object
        Raw comment. Anything that is not a ``str`` (for example a float NaN),
        and any string equal to ``'nan'`` ignoring case, is returned unchanged.

    Returns
    -------
    object
        For ordinary strings: the lowercased text with every character other
        than ASCII letters and whitespace deleted, English stop words dropped,
        each word passed through ``WordNetLemmatizer.lemmatize``, and the
        words joined by single spaces. Otherwise the input itself.
    """
    # Pass non-text values and the literal 'nan' through untouched
    if not isinstance(text, str) or text.lower() == 'nan':
        return text

    stop_words, lemmatizer = _lemm_resources()

    # Characters are deleted rather than replaced by a space, so "it's" -> "its"
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # split() with no argument is what collapses runs of whitespace
    kept_words = [word for word in letters_only.split() if word not in stop_words]

    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

# Run clean_comments_lemm over every review and store the result beside the raw text
lemmatized_comments = df['comments'].apply(clean_comments_lemm)
df['tokens_lemm'] = lemmatized_comments

df.head()

Unnamed: 0,professor_name,department_name,star_rating,num_student,student_star,student_difficult,comments,tokens_stemm,tokens_lemm
0,Robert Olshansky,Urban & Regional Planning department,3.5,1,3.5,2.0,"Good guy, laid back and interested in his field. Class can get... a little..... slllllllloooooowwwwwwww during his junior workshop.",good guy laid back interest field class get littl slllllllloooooowwwwwwww junior workshop,good guy laid back interested field class get little slllllllloooooowwwwwwww junior workshop
1,Marshall Levett,Counseling department,5.0,2,5.0,1.0,such a fun professor. really helpful and knows his stuff,fun professor realli help know stuff,fun professor really helpful know stuff
2,Marshall Levett,Counseling department,5.0,2,5.0,1.0,Such a easy class. It\'s simple. Do your homework and pay attention and you will fly right by or be the person that blames him for not leaarning. He wont let you fail. just ask for help....,easi class simpl homework pay attent fli right person blame leaarn wont let fail ask help,easy class simple homework pay attention fly right person blame leaarning wont let fail ask help
3,Soazig Le Bihan,Philosophy department,3.6,4,5.0,5.0,"A very hard class, and a massive amount of work. But, Soazig is also very good about explaining difficult concepts, gives excellent feedback, and is very accessible for extra assistance.",hard class massiv amount work soazig also good explain difficult concept give excel feedback access extra assist,hard class massive amount work soazig also good explaining difficult concept give excellent feedback accessible extra assistance
4,Soazig Le Bihan,Philosophy department,3.6,4,1.0,4.0,"Took 100 level class for Ethics offered online as an option to fill a core requirement She was terrible! Did not seem to have a grasp of the English language nor does she seem to have a grasp on reality as she insisted many times that failure in an ENTRY LEVEL, OPTIONAL class is very common due to the ""difficulty"" of material, very full of herself",took level class ethic offer onlin option fill core requir terribl seem grasp english languag seem grasp realiti insist mani time failur entri level option class common due difficulti materi full,took level class ethic offered online option fill core requirement terrible seem grasp english language seem grasp reality insisted many time failure entry level optional class common due difficulty material full


The variable that I am primarily focused on predicting from the comments is the professor's overall quality star rating. This is a continuous numerical variable; however, it can be conceptually broken up into quality categories. According to RMP’s official standard, a rating of 3.5-5.0 is good, 2.5-3.4 is average and 1.0-2.4 is poor. I would like to determine the terms that distinguish the highest performing professors, so I would like to see whether we can structure this initial model as a binary classification problem by dividing the ratings into 'good' and 'bad', with ratings greater than or equal to 3.5 (>= 3.5) categorized as 'good' (1) and ratings lower than 3.5 (< 3.5) categorized as 'bad' (0).

In [40]:
# Binary target 'rating_result':
#   1 -> star_rating of 3.5 or more ("good" / positive)
#   0 -> star_rating below 3.5 ("bad" / negative)
# NOTE(review): a missing star_rating compares False and would be labelled 0;
# only 'student_star' and 'comments' were null-filtered earlier - confirm
# star_rating has no nulls.
is_good = df['star_rating'].ge(3.5)
df['rating_result'] = is_good.astype(int)

# Class balance of the new target
rating_result_counts = df['rating_result'].value_counts()
print(rating_result_counts)

rating_result
1    13301
0     6283
Name: count, dtype: int64


There are 13301 'good' ratings and 6283 'bad' ratings. This means that 68% of the data set is composed of high ratings. We will use sklearn.model_selection.train_test_split() to split the dataset into training and test subsets.

In [42]:
# Hold out 20% of the lemmatized comments (and their labels) for testing;
# random_state pins the shuffle so the split is repeatable
train_data, test_data, train_labels, test_labels = train_test_split(
    df['tokens_lemm'],
    df['rating_result'],
    random_state=42,
    test_size=0.2,
)

# Label counts on each side of the split
training_counts = train_labels.value_counts()
test_counts = test_labels.value_counts()

print("Training Set:", training_counts, sep="\n")
print("\nTesting Set:", test_counts, sep="\n")

Training Set:
rating_result
1    10616
0     5051
Name: count, dtype: int64

Testing Set:
rating_result
1    2685
0    1232
Name: count, dtype: int64


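The distribution of 'good' and 'bad' ratings in the training and test sets reflects that of the initial data set: roughly 68% of the ratings are 'good' in each (10616 of 15667 in training, 2685 of 3917 in testing).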