In [None]:
#VADER ANALYSIS
import nltk
# Import the analyzer class explicitly so that this cell does not rely on an
# import made in some other cell (none is visible in this notebook section).
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The VADER lexicon must be available locally before the analyzer is created.
nltk.download('vader_lexicon')

# One analyzer instance is created here and reused by the scoring cells below.
analyzer = SentimentIntensityAnalyzer()
df.head() # The Content column contains the reviews that need to be analyzed.
#Content is what we will complete a sentiment analysis on. 

In [None]:
# Pull out the text of the very first review so it can be read by eye
text1 = df['Content'].iloc[0]
text1

In [None]:
# Smoke test: score the first review (text1) to confirm the analyzer runs
# properly before it is applied to every row.
analyzer.polarity_scores(text1) 
# Output recorded from an earlier run of this cell:
#{'neg': 0.0, 'neu': 0.873, 'pos': 0.127, 'compound': 0.4939}

In [None]:
# Score every review in the Content column with VADER.
#   Polarity_Score -> the compound score (signed, not an absolute value), rounded to 3 decimals
#   Sentiment      -> 'POSITIVE' when the compound score is above 0, otherwise 'NEGATIVE'
#                     (a compound score of exactly 0 therefore gets the 'NEGATIVE' label)
compounds = [analyzer.polarity_scores(review)['compound'] for review in df['Content']]
# Labels are derived from the unrounded compound scores
values = ['POSITIVE' if compound > 0 else 'NEGATIVE' for compound in compounds]

df['Polarity_Score'] = compounds
df['Polarity_Score'] = df['Polarity_Score'].round(3)
df['Sentiment'] = values

In [None]:
# Preview the first 100 rows of df to inspect the scored reviews
df.head(100)

In [None]:
#FLAIR
import flair
from flair.models import TextClassifier
from flair.data import Sentence
import pandas as pd
# Only the first 100 rows of the saved file are used for the Flair comparison
df1= pd.read_csv('merged_file.csv').head(100)

# Load the 'en-sentiment' classifier and try it on a single hand-written sentence
classifier = TextClassifier.load('en-sentiment')
sentence = Sentence('The food was mid.')
classifier.predict(sentence)
# Print the sentence after prediction to inspect the result
print(sentence)

In [None]:
#flair values
values2 = []   # predicted label for each review: 'POSITIVE' or 'NEGATIVE'
scores = []    # confidence of that label (0.0 to 1.0)

for review in df1['Content']:
    text = Sentence(review)
    classifier.predict(text)
    # The first label attached to the sentence is the one recorded
    top_label = text.labels[0]
    values2.append(top_label.value)
    scores.append(top_label.score)

df1['Sentiment'] = values2
df1['Polarity'] = scores #Scores are absolute instead of [-1,1] from Vader
df1

In [None]:
#The Vader values are a little bit more accurate for the Content column versus the Flair style. 
#Example: 
#Row 4: The Platform is about as subtle as a punch in the face but that's by design. It's social commentary via blunt instrument using genre trappings and pitch-black satire as a club bashing at its targets with barely restrained glee.
#Row 98: Crow and his two fine Welsh-burred leads commit fully to the anguished nerve-fraying cause but their efforts can't conceal a certain thinness to the dramatic material...
#Row 100: Many audiences are going to be utterly shocked at the lengths the two leading actors Pattinson Dafoe are pushed. They have a loathsome dynamic. However those who can get on 
#  Eggers' level will be hypnotized by this eccentric experiment.

#The Flair analysis labels these reviews as negative, but after reading the reviews themselves, I would consider them more positive than negative. 
#Thus, we will use Vader for the sentiment analysis. 

In [None]:
# Impute the missing (NA) Reviewer_Rating values with the VADER polarity score.
# pop() removes the Polarity_Score column from df as it is read, so the column is
# gone afterwards; ratings that were already present are left unchanged.
# The Sentiment column is retained for now.
polarity_score = df.pop('Polarity_Score')
df['Reviewer_Rating'] = df['Reviewer_Rating'].fillna(polarity_score)

In [None]:
#Written to CSV
# index=False: the DataFrame index is not written out as a column of merged_file.csv
df.to_csv('merged_file.csv', index=False)

In [None]:
# END OF PART 3
# START OF PART 4

In [None]:
## 1. Use regression algorithms to predict movie success (the factors you selected in Phase 2) 
## based on  relevant variables (IVs). Make sure  you include the reviewer rating variable with NAs being imputed either as an IV or DV in your model analysis (20 pts)

## 1. Use regression algorithms to predict movie success (the factors you selected in Phase 2) 
## based on  relevant variables (IVs). Make sure  you include the reviewer rating variable with NAs being imputed either as an IV or DV in your model analysis (20 pts)

In [None]:
# reload the saved dataset; index_col=0 uses the first column of the file as the index
# NOTE(review): the file is written with index=False above, so the first column is a data column — confirm this is intended
df = pd.read_csv('merged_file.csv', index_col=0)
# make a copy of the original DataFrame so the modelling steps do not modify df
df_copy = df.copy()

# create dummy variables (stored as int) for the Month and Genre_1 columns
# drop_first=True leaves out the first level of each column
# NOTE(review): later cells select 'April', 'Action' and 'Drama' by name — confirm none of them is the level dropped here
dummy_month = pd.get_dummies(df_copy['Month'],drop_first=True,dtype='int')
dummy_genre = pd.get_dummies(df_copy['Genre_1'],drop_first=True,dtype='int')

# concatenate the dummy variables with the original DataFrame (column-wise)
df_copy = pd.concat([df_copy, dummy_month, dummy_genre], axis=1)
df_copy

In [None]:
# Prepare the modelling frame in one chain:
#   1. remove the original Month and Genre columns (their dummies are already in place)
#   2. turn +/- infinity into NaN and drop every row that has a missing value
#   3. keep only the numeric columns
df_copy = (
    df_copy
    .drop(columns=['Month', 'Genre_1', 'Genre_2', 'Genre_3'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .select_dtypes(include='number')
)
df_copy

# MODEL 1 - OLS Linear Regression, Revenue as DV

In [None]:
# dependent variable vector y: Revenue
y = df_copy['Revenue']
# independent variable matrix X: every other column of df_copy, plus a constant term
X = sm.add_constant(df_copy.drop(columns=['Revenue']))
# fit the OLS model and print its summary
model1 = sm.OLS(y, X).fit()
print(model1.summary())

In [None]:
# Check vifs: one variance inflation factor per column of X, listed next to the column name
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(X.values, position) for position in range(X.shape[1])],
    "features": X.columns,
})
print(vif)

### OLS results for **Revenue** (no stepwise selection, no standardization)  
**Adj. R² = 0.820** **N = 97 423** F-statistic ≈ 1.1 × 10⁴ p(F) < 0.001  

The model explains 82 % of the variance in box-office revenue. The condition number is large (≈ 1.9 × 10¹¹), but all VIFs are below 3, indicating no serious multicollinearity. Numerical scaling or ridge regularisation would still improve stability.

---

#### Continuous predictors  

| Variable          | Coefficient (USD)         | p-value | Comment |
|-------------------|--------------------------:|--------:|---------|
| Budget (per \$1 M)       | **+2.59 M** | <0.001 | Strong, direct driver of revenue |
| Vote_Count (per 1 000)   | **+59 k**  | <0.001 | Audience reach matters |
| Vote_Average              | **–17.9 M** | <0.001 | Higher‐scoring films tend to be limited-release or niche |
| Tomato_Meter              | **–4.81 M** | <0.001 | Critical acclaim alone does not boost revenue |
| Popularity                | **–87.8 k** | <0.001 | TMDB “buzz” is negatively correlated with revenue in this set |
| Review_Object_Year        | **+14.1 M** | <0.001 | Later reviews (younger titles) earn more |
| ROI                       | **+0.61**  | <0.001 | Positive but small monetary effect |
| Runtime                   | –15.7 k    | 0.52   | Not significant |
| Reviewer_Rating           | +1.28 M    | 0.17   | Not significant |
| Critic_ID                 | –1.51 k    | 0.10   | Marginal |

---

#### Seasonality (April is the reference month)  

| Month       | Δ Revenue (Millions) | Significant |
|-------------|---------------------:|:-----------:|
| August      | –42.7 | Yes |
| December    | –47.7 | Yes |
| February    | –79.3 | Yes |
| January     | –48.6 | Yes |
| July        | –44.6 | Yes |
| June        | –14.3 | Yes |
| March       | –63.5 | Yes |
| May         | –62.2 | Yes |
| November    | –58.8 | Yes |
| October     | –61.6 | Yes |
| September   | –41.4 | Yes |

Releases outside April under-perform, with February and March showing the steepest declines.

---

#### Genre effects (Drama/unclassified is baseline)  

| Genre            | Δ Revenue (Millions) | Significant |
|------------------|---------------------:|:-----------:|
| Adventure        | +112.7 | Yes |
| Animation        | +72.6  | Yes |
| Comedy           | +31.1  | Yes |
| Documentary      | +65.3  | Yes |
| Fantasy          | +13.2  | Yes |
| History          | +10.5  | Yes |
| Horror           | +28.8  | Yes |
| Music            | +67.6  | Yes |
| Not available    | +44.1  | Yes |
| Romance          | +78.8  | Yes |
| Thriller         | +29.6  | Yes |
| TV Movie         | +54.2  | Yes |
| Action           | +31.1  | Yes |
| Crime            | –21.1  | Yes |
| Mystery          | –16.7  | Yes |
| Science Fiction  | –128.9 | Yes |
| War              | –53.3  | Yes |
| Western          | –23.5  | Yes |
| Family           | –8.0   | 0.035 |

Family, Crime, Sci-Fi, War, and Western genres depress revenue, while Adventure, Animation, Romance, and Documentary show the highest positive impact.

---

### Interpretation

* **Budget** and **audience engagement** (Vote_Count) remain the strongest positive predictors.  
* High **critic/audience scores** (Vote_Average, Tomato_Meter) correlate negatively with revenue, suggesting a divide between critical reception and commercial performance.  
* **April releases** outperform every other month; films launched in late winter (February–March) perform poorest.  
* Genre choice is pivotal: big-budget, crowd-pleasing genres (Adventure, Animation, Romance) add tens of millions, whereas niche or serious genres (Science Fiction in this data, War, Western) reduce expected revenue.  

### Next steps

1. Standardise predictors or apply Ridge/Lasso to curb numerical warnings.  
2. Test interaction terms (e.g., Budget × Genre) to capture genre-specific ROI.  
3. Explore non-linear models or tree-based methods for potential gains beyond the 82 % variance explained here.

In [None]:
# New Model 1 - Standardized - Revenue as DV
# Display df_copy (the numeric modelling frame) for reference before picking predictors
df_copy

In [None]:
# Define predictor and dependent variables
X = df_copy[['Popularity', 'Vote_Count',
        'Tomato_Meter', 'Reviewer_Rating', 
        'April', 'August', 'December', 'February', 'January', 'June',
        'March', 'May', 'November', 'October', 'September', 'Action',
        'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
        'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
        'Romance', 'Science Fiction', 'Thriller', 'War', 'Western']] 

# The target is taken from df_copy (not df): df_copy may have fewer rows than df
# after the dropna step, and y must contain exactly the same rows as X.
# The index is reset so y lines up with the 0..n-1 index of the standardized frame below.
y = df_copy['Revenue'].reset_index(drop=True)

# Remove TV movie because it's not statistically significant
# Remove Drama because its correlation with Revenue is only -.15 AND it has a problematic VIF of 25.27
# Remove July because its correlation is only .03 and is not statistically significant
# Remove Budget because it has multicollinearity with Vote_Count. Vote_Count has a higher correlation at .82, so remove Budget 
# Remove Runtime and Vote_Average due to multicollinearity 

# Standardize predictor variables.
# The scaled array is wrapped in a DataFrame so the original column names are kept
# and appear in the regression summary.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Add constant term to X matrix
X_scaled = sm.add_constant(X_scaled)

# Fit OLS model
model = sm.OLS(y, X_scaled).fit()

# Print summary of model results
print(model.summary())

In [None]:
# Check vifs for the (unscaled) predictors in X, one row per predictor
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(X.values, position) for position in range(X.shape[1])],
    "features": X.columns,
})
print(vif)

### Output results (revenue model with standardization, but no stepwise): 
##### The condition number is under 30. The standardization issue has been resolved. 
##### The model explains 73.1% of the variance in the revenue of movies. 
const: This is the intercept term. It represents the expected value of revenue when all other predictors are set to zero. Since all predictors have been standardized to have a mean of zero and a standard deviation of one, this intercept represents the expected revenue when all predictors are at their mean values.

Popularity: For each standard deviation increase in popularity, we expect a decrease in revenue by 3.3 million dollars, holding all other predictors constant.

Vote_Count: For each standard deviation increase in vote count, we expect an increase in revenue by 214.4 million dollars, holding all other predictors constant.

Tomato_Meter: For each standard deviation increase in the Tomato Meter rating, we expect a decrease in revenue by 5.9 million dollars, holding all other predictors constant.

Reviewer_Rating: For each standard deviation increase in the reviewer rating, we expect a decrease in revenue by 1.7 million dollars, holding all other predictors constant.

April: For each standard deviation increase in April, we expect an increase in revenue by 10.2 million dollars, holding all other predictors constant.

August: For each standard deviation increase in August, we expect a decrease in revenue by 4.7 million dollars, holding all other predictors constant.

December: For each standard deviation increase in December, we expect a decrease in revenue by 3.4 million dollars, holding all other predictors constant.

February: For each standard deviation increase in February, we expect a decrease in revenue by 12.8 million dollars, holding all other predictors constant.

January: For each standard deviation increase in January, we expect a decrease in revenue by 4.8 million dollars, holding all other predictors constant.

June: For each standard deviation increase in June, we expect an increase in revenue by 6.6 million dollars, holding all other predictors constant.

March: For each standard deviation increase in March, we expect a decrease in revenue by 8.5 million dollars, holding all other predictors constant.

May: For each standard deviation increase in May, we expect a decrease in revenue by 6.4 million dollars, holding all other predictors constant.

November: For each standard deviation increase in November, we expect a decrease in revenue by 11.3 million dollars, holding all other predictors constant.

October: For each standard deviation increase in October, we expect a decrease in revenue by 13.6 million dollars, holding all other predictors constant.

September: For each standard deviation increase in September, we expect a decrease in revenue by 5.1 million dollars, holding all other predictors constant.

Action: For each standard deviation increase in the Action genre, we expect an increase in revenue by 9.6 million dollars, holding all other predictors constant.

Adventure: For each standard deviation increase in the Adventure genre, we expect an increase in revenue by 40.8 million dollars, holding all other predictors constant.

Animation: For each standard deviation increase in the Animation genre, we expect an increase in revenue by 12.5 million dollars, holding all other predictors constant.

Comedy: For each standard deviation increase in the Comedy genre, we expect a decrease in revenue by 2.2 million dollars, holding all other predictors constant.

Crime: For each standard deviation increase in the Crime genre, we expect a decrease in revenue by 16.8 million dollars, holding all other predictors constant.

The remaining genre variables (Documentary through Western) are interpreted in the same way.

# New Model 1 - Forward Stepwise Standardized - Revenue as DV
##### The model below has an overall good fit, with an R-squared value of 0.821. However, the high condition number (1.14e+09) suggests possible multicollinearity, which might affect the reliability of the coefficient estimates. 

In [None]:
# Sanity check: compare the shapes of X and y before fitting
print(X.shape)
print(y.shape)
# Replace y's index with a fresh 0..n-1 index, in place (drop=True discards the old index).
# NOTE(review): inplace=True mutates the existing Series object instead of creating a new one —
# confirm nothing else relies on y's previous index.
y.reset_index(drop=True, inplace=True)

In [None]:
# create independent variable matrix X and dependent variable vector y
X = df_copy[['Reviewer_Rating','Popularity', 'Budget',  'Vote_Count', 'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Action', 
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']]
# Removed vote_average. vif of 40
# Removed drama. vif of 37
# Removed runtime. vif of 14

# X_scaled_df below is built from a bare array and therefore gets a fresh 0..n-1 index.
# y is given the same 0..n-1 index here (rows stay in the same order), because
# statsmodels refuses to fit when the indices of endog and exog differ.
y = df_copy['Revenue'].reset_index(drop=True)

# Standardize predictor variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with standardized X and original column names
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Forward stepwise regression function
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Greedy forward stepwise selection of OLS predictors by p-value.

    Starting with no predictors, every round fits one OLS model per column that
    is not yet included (constant + included columns + that candidate) and
    records the candidate's p-value. The candidate with the smallest p-value is
    added when that p-value is below ``threshold_in``; otherwise selection stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Dependent variable, passed to ``sm.OLS`` unchanged.
    threshold_in : float, default 0.01
        A candidate is added only if its p-value is strictly below this value.
    verbose : bool, default True
        When true, print each added column together with its p-value.

    Returns
    -------
    The fitted ``sm.OLS`` results for a constant plus the selected columns.
    """
    included = []
    while True:
        # Keep the candidates in X's column order (a set difference has no stable
        # order), so that ties between equal p-values are resolved the same way
        # on every run.
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            break
        # dtype is given explicitly: an empty Series without a dtype is not
        # guaranteed to be float, and min()/idxmin() below need numeric values.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as "not <" so that a NaN best p-value also ends the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))
    # Refit once on the final set of columns and hand that model back.
    return sm.OLS(y, sm.add_constant(X[included])).fit()

# Perform forward stepwise selection on the standardized predictors and print the summary of the returned model
model = forward_selection(X_scaled_df, y)
print(model.summary())


In [None]:
# check vifs: one value per column of X, labelled with the column name
vifs = pd.Series(
    data=[variance_inflation_factor(X.values, position) for position, _ in enumerate(X.columns)],
    index=X.columns,
)
print("VIFs:", vifs, sep="\n")

### Output results (revenue model with standardization AND stepwise): 
##### The condition number is under 30. The standardization issue has been resolved. 
##### The model explains 81.8% of the variance in the revenue of movies. 
The constant term (intercept) of the model is 119,100,000, which represents the expected revenue when all the standardized predictor variables are equal to zero. Since this is not a meaningful scenario for most of the predictor variables, the intercept term should be interpreted with caution.

The coefficient for Vote_Count is 150,300,000, which means that a one-standard-deviation increase in Vote_Count is associated with an increase in revenue of 150,300,000 dollars, while holding all other predictors constant. Increasing the number of votes a movie receives by one standard deviation is associated with a large increase in revenue. For example, a one-standard-deviation increase in Vote_Count is equivalent to increasing the number of votes a movie receives from 10,000 to 100,000. Based on our model, this would correspond to an increase in revenue of approximately 150 million.

The coefficient for Adventure is 12,780,000, which means that a one-standard-deviation increase in Adventure is associated with an increase in revenue of 12,780,000 dollars, while holding all other predictors constant.

The coefficient for Crime is -11,460,000, which means that a one-standard-deviation increase in Crime is associated with a decrease in revenue of 11,460,000 dollars, while holding all other predictors constant.

The coefficient for Budget is 119,400,000, which means that a one-standard-deviation increase in Budget is associated with an increase in revenue of 119,400,000 dollars, while holding all other predictors constant. Increasing the budget of a movie by one standard deviation is associated with a large increase in revenue. For example, a one-standard-deviation increase in Budget is equivalent to increasing the budget of a movie from $10 million to $100 million. Based on our model, this would correspond to an increase in revenue of approximately $119 million.

The coefficient for Popularity is -15,180,000, which means that a one-standard-deviation increase in Popularity is associated with a decrease in revenue of 15,180,000 dollars, while holding all other predictors constant.

The coefficient for April is 10,670,000, which means that a one-standard-deviation increase in April is associated with an increase in revenue of 10,670,000 dollars, while holding all other predictors constant.

The coefficient for Science Fiction is -27,130,000, which means that a one-standard-deviation increase in Science Fiction is associated with a decrease in revenue of 27,130,000 dollars, while holding all other predictors constant.

The coefficient for June is 7,227,000, which means that a one-standard-deviation increase in June is associated with an increase in revenue of 7,227,000 dollars, while holding all other predictors constant.

The coefficient for Action is -11,540,000, which means that a one-standard-deviation increase in Action is associated with a decrease in revenue of 11,540,000 dollars, while holding all other predictors constant.

The coefficient for War is -8,225,000, which means that a one-standard-deviation increase in War is associated with a decrease in revenue of 8,225,000 dollars, while holding all other predictors constant.

The coefficient for Mystery is -5,515,000, which means that a one-standard-deviation increase in Mystery is associated with a decrease in revenue of 5,515,000 dollars, while holding all other predictors constant.

The coefficient for Family is -5,504,000, which means that a one-standard-deviation increase in Family is associated with a decrease in revenue of 5,504,000 dollars, while holding all other predictors constant.

Romance: A one-standard-deviation increase in Romance is associated with a decrease in revenue of 5.187 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in Romance is equivalent to increasing the proportion of romantic movies in the dataset from 23% to 56%. Based on our model, this would correspond to a decrease in revenue of approximately 5.2 million.

History: A one-standard-deviation increase in History is associated with a decrease in revenue of 4.429 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in History is equivalent to increasing the proportion of historical movies in the dataset from 6% to 39%. Based on our model, this would correspond to a decrease in revenue of approximately 4.4 million.

February: A one-standard-deviation increase in February is associated with a decrease in revenue of 6.388 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in February is equivalent to increasing the proportion of movies released in February from 6% to 39%. Based on our model, this would correspond to a decrease in revenue of approximately 6.4 million.

Documentary: A one-standard-deviation increase in Documentary is associated with an increase in revenue of 4.505 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in Documentary is equivalent to increasing the proportion of documentary movies in the dataset from 2% to 35%. Based on our model, this would correspond to an increase in revenue of approximately 4.5 million.

Western: A one-standard-deviation increase in Western is associated with a decrease in revenue of 4.142 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in Western is equivalent to increasing the proportion of western movies in the dataset from 1% to 34%. Based on our model, this would correspond to a decrease in revenue of approximately 4.1 million.

September: A one-standard-deviation increase in September is associated with an increase in revenue of 1.212 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in September is equivalent to increasing the proportion of movies released in September from 8% to 41%. Based on our model, this would correspond to an increase in revenue of approximately 1.2 million.

Tomato_Meter: A one-standard-deviation increase in Tomato_Meter is associated with a decrease in revenue of 3.9 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in Tomato_Meter is equivalent to increasing the average rating of movies in the dataset from 33 to 74. Based on our model, this would correspond to a decrease in revenue of approximately 3.9 million.

Fantasy: A one-standard-deviation increase in Fantasy is associated with a decrease in revenue of 3.268 million dollars, while holding all other predictors constant. For example, a one-standard-deviation increase in Fantasy is equivalent to increasing the proportion of fantasy movies in the dataset from 11% to 44%. Based on our model, this would correspond to a decrease in revenue of approximately 3.3 million.

The coefficient for Comedy is -2,534,000, which means that a one-standard-deviation increase in the proportion of Comedy movies is associated with a decrease in revenue of approximately 2.5 million dollars, while holding all other predictors constant. In other words, if the proportion of Comedy movies released increased by one standard deviation, this would correspond to a decrease in revenue of approximately 2.5 million dollars, based on our model.

October: A one-standard-deviation increase in the proportion of movies released in October is associated with a decrease in revenue of approximately $5.4 million, holding all other predictors constant. This suggests that releasing a movie in October may not be as profitable as releasing it in other months.

November: A one-standard-deviation increase in the proportion of movies released in November is associated with a decrease in revenue of approximately $5.0 million, holding all other predictors constant. This suggests that releasing a movie in November may not be as profitable as releasing it in other months.

March: A one-standard-deviation increase in the proportion of movies released in March is associated with a decrease in revenue of approximately $4.4 million, holding all other predictors constant. This suggests that releasing a movie in March may not be as profitable as releasing it in other months.

May: A one-standard-deviation increase in the proportion of movies released in May is associated with a decrease in revenue of approximately $4.4 million, holding all other predictors constant. This suggests that releasing a movie in May may not be as profitable as releasing it in other months.

Animation: A one-standard-deviation increase in the proportion of animated movies is associated with an increase in revenue of approximately $3.4 million, holding all other predictors constant. This suggests that releasing more animated movies may lead to higher revenue.

Music: A one-standard-deviation increase in the proportion of movies with a music genre is associated with an increase in revenue of approximately $2.4 million, holding all other predictors constant. This suggests that including more music genres in movies may lead to higher revenue.

December: A one-standard-deviation increase in the proportion of movies released in December is associated with a decrease in revenue of approximately $1.7 million, holding all other predictors constant. This suggests that releasing a movie in December may not be as profitable as releasing it in other months.

Thriller: A one-standard-deviation increase in the proportion of thriller movies is associated with a decrease in revenue of approximately $1.1 million, holding all other predictors constant. This suggests that including more thriller genres in movies may lead to lower revenue.

# MODEL 2 - OLS Linear Regression, Vote_Average as DV

In [None]:
# dependent variable vector y: Vote_Average
y = df_copy['Vote_Average']

# independent variable matrix X: the predictors listed below, with a constant term added
X = sm.add_constant(df_copy[['Budget', 'Runtime', 'Popularity', 'Revenue',
       'Vote_Count', 'Reviewer_Rating',
       'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']])

# fit the OLS model and print its summary
model2 = sm.OLS(y, X).fit()
print(model2.summary())

In [1]:
# VIFs for Model 2: one value per column of X, labelled with the column name
from statsmodels.stats.outliers_influence import variance_inflation_factor
vifs = pd.Series(
    data=[variance_inflation_factor(X.values, position) for position, _ in enumerate(X.columns)],
    index=X.columns,
)
print("VIFs:", vifs, sep="\n")

NameError: name 'pd' is not defined

### Output results (Vote_Average Model, no standardization or stepwise): 
##### The condition number is above 30, so standardization needs to occur. There are also severe multicollinearity issues, but we can resolve these in a standardized stepwise model. 
##### The model explains 35.3% of the variance in the Vote_Average of movies. 
const: This is the intercept term of the model and represents the predicted value of the response variable (Vote_Average) when all predictor variables are equal to 0. In this case, the intercept is 5.6913.

Budget: For every unit increase in budget, the predicted vote average decreases by 1.658e-09. This suggests that higher-budget movies do not necessarily receive higher ratings.

Runtime: For every unit increase in runtime, the predicted vote average increases by 0.0073. This suggests that longer movies tend to have higher ratings.

Popularity: For every unit increase in popularity, the predicted vote average increases by 0.0002. This suggests that more popular movies tend to have higher ratings.

Revenue: For every unit increase in revenue, the predicted vote average decreases by 5.033e-10. This suggests that higher-revenue movies do not necessarily receive higher ratings.

Vote_Count: For every unit increase in vote count, the predicted vote average increases by 0.0002. This suggests that movies with more votes tend to have higher ratings.

Reviewer_Rating: For every unit increase in reviewer rating, the predicted vote average increases by 0.1468. This suggests that movies with higher reviewer ratings tend to have higher ratings.

Tomato_Meter: For every unit increase in tomato meter score, the predicted vote average increases by 0.3265. This suggests that movies with higher tomato meter scores tend to have higher ratings.

April, August, December, February, January, July, June, March, May, November, October, September: These are dummy variables for each month. The coefficients represent the difference in predicted vote average compared to January (the reference category) for each respective month. For example, the coefficient for December is 0.2565, which means that movies released in December tend to have higher ratings than movies released in January.

Action, Adventure, Animation, Comedy, Crime, Documentary, Drama, Family, Fantasy, History, Horror, Music, Mystery, Romance, Science Fiction, TV Movie, Thriller, War, Western: These are dummy variables for each genre. The coefficients represent the difference in predicted vote average compared to Drama (the reference category) for each respective genre. For example, the coefficient for Animation is 0.3883, which means that animation movies tend to have higher ratings than drama movies.

# New Model 2 - Forward Stepwise Standardized - Vote_Average as DV
##### The R-squared value of 0.322 indicates that approximately 32.2% of the variation in Vote_Average is explained by the independent variables in the model.

In [None]:
# Sanity check: compare the shapes of X and y before fitting
print(X.shape)
print(y.shape)
# Replace y's index with a fresh 0..n-1 index, in place (drop=True discards the old index).
# NOTE(review): inplace=True mutates the existing Series object instead of creating a new one —
# confirm nothing else relies on y's previous index.
y.reset_index(drop=True, inplace=True)

In [None]:
# create independent variable matrix X and dependent variable vector y
X = df_copy[[  'Popularity', 'Budget', 'Vote_Count',
        'Reviewer_Rating', 'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']]

#remove drama, vif is 36
#remove revenue to reduce vifs from 6 to under 5 for everything 
#runtime, vif 13

# X_scaled_df below is built from a bare array and therefore gets a fresh 0..n-1 index.
# y is given the same 0..n-1 index here (rows stay in the same order), because
# statsmodels refuses to fit when the indices of endog and exog differ. Doing the
# reset in this cell means it no longer depends on an earlier cell having run first.
y = df_copy['Vote_Average'].reset_index(drop=True)

# Standardize predictor variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with standardized X and original column names
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Forward stepwise regression function
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Greedy forward stepwise selection of OLS predictors by p-value.

    Starts with no predictors. In each round, one OLS model (with intercept)
    is fitted per not-yet-selected column, using the already selected columns
    plus that candidate, and the candidate's p-value is recorded. The candidate
    with the smallest p-value is added when that p-value is below
    ``threshold_in``; otherwise selection stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns.
    y : array-like
        Dependent variable passed to ``sm.OLS`` as the response.
    threshold_in : float, default 0.01
        A candidate is only added if its p-value is strictly below this value.
    verbose : bool, default True
        Print each added column together with its p-value.

    Returns
    -------
    The fitted ``sm.OLS`` results for the intercept plus the selected columns.
    """
    included = []
    while True:
        # Walk X's columns in their own order: a set difference has arbitrary
        # order, so ties between equal p-values could resolve differently
        # from one run to the next.
        excluded = [column for column in X.columns if column not in included]
        # Explicit float dtype: a data-less Series is object-typed on pandas 2.x,
        # and idxmin() is not supported for object dtype.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # With no candidates left, min() is NaN and the comparison is False,
        # so the loop also ends once every column has been selected.
        if not (best_pval < threshold_in):
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
    # Refit once on the final set of selected columns and hand back that fit
    model = sm.OLS(y, sm.add_constant(X[included])).fit()
    return model

# Run the stepwise search on the standardized predictors, then show the final OLS table
model = forward_selection(X=X_scaled_df, y=y)
print(model.summary())


In [None]:
# check vifs
from statsmodels.stats.outliers_influence import variance_inflation_factor
# variance_inflation_factor regresses each column on the remaining columns and
# does not add an intercept itself, so the matrix it receives must already
# contain a constant column for the result to be the usual VIF.
# add_constant leaves the frame unchanged if a constant is already present.
# NOTE(review): if every movie has exactly one month flag set, the twelve month
# dummies sum to the constant and their VIFs will be infinite -- confirm, and
# consider dropping one month as the reference category.
X_vif = sm.add_constant(X)
vifs = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
)
print("VIFs:")
# The 'const' row is only there to make the other rows correct; it is not a
# multicollinearity diagnostic in its own right.
print(vifs)

### Output results (Vote_Average Model, stepwise and standardized): 
##### The condition number is below 30. The standardization and multicollinearity issues have been resolved.
##### The model explains 32.2% of the variance in the Vote_Average of movies.
Vote_Count: For each 1 standard deviation increase in Vote_Count, we can expect an increase of 0.39 in Vote_Average. This suggests that having more votes can be a positive factor in determining the success of a movie.

Documentary: For each 1 standard deviation increase in the standardized Documentary indicator (a 0/1 genre flag on each movie, not a count of films released), we can expect an increase of 0.11 in Vote_Average, holding the other predictors constant. This suggests that the documentary genre can be a positive factor in determining the success of a movie.

Animation: For each 1 standard deviation increase in the number of animation films released, we can expect an increase of 0.07 in Vote_Average. This suggests that the animation genre can be a positive factor in determining the success of a movie.

Action: For each 1 standard deviation increase in the number of action films released, we can expect a decrease of 0.12 in Vote_Average. This suggests that the action genre may not be a positive factor in determining the success of a movie.

Horror: For each 1 standard deviation increase in the number of horror films released, we can expect a decrease of 0.14 in Vote_Average. This suggests that the horror genre may not be a positive factor in determining the success of a movie.

Thriller: For each 1 standard deviation increase in the number of thriller films released, we can expect a decrease of 0.10 in Vote_Average. This suggests that the thriller genre may not be a positive factor in determining the success of a movie.

Reviewer_Rating: For each 1 standard deviation increase in the reviewer rating, we can expect an increase of 0.07 in Vote_Average. This suggests that positive reviews can be a positive factor in determining the success of a movie.

Science Fiction: For each 1 standard deviation increase in the number of science fiction films released, we can expect a decrease of 0.08 in Vote_Average. This suggests that the science fiction genre may not be a positive factor in determining the success of a movie.

Tomato_Meter: For each 1 standard deviation increase in the Tomato Meter rating, we can expect an increase of 0.14 in Vote_Average. This suggests that high ratings on Rotten Tomatoes can be a positive factor in determining the success of a movie.

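February: For each 1 standard deviation increase in the standardized February release indicator (a 0/1 flag on each movie, not a count of movies released that month), we can expect a decrease of 0.07 in Vote_Average, holding the other predictors constant. This suggests that February may not be a favorable month for movie releases.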

Budget: For each 1 standard deviation increase in budget, we can expect a decrease of 0.11 in Vote_Average. This suggests that high budgets may not necessarily result in higher success for a movie.

Comedy: For each 1 standard deviation increase in the number of comedy films released, we can expect a decrease of 0.06 in Vote_Average. This suggests that the comedy genre may not be a positive factor in determining the success of a movie.

Mystery: For each 1 standard deviation increase in the number of mystery films released, we can expect a decrease of 0.06 in Vote_Average. This suggests that the mystery genre may not be a positive factor in determining the success of a movie.

Popularity: For each 1 standard deviation increase in popularity, we can expect an increase of 0.05 in Vote_Average. This suggests that popular movies may have higher success.

November: For every 1 standard deviation increase in the number of movies released in November, the vote average of a movie is expected to increase by 0.0246 points.

April: For every 1 standard deviation increase in the number of movies released in April, the vote average of a movie is expected to decrease by 0.0573 points.

Fantasy: For every 1 standard deviation increase in the proportion of fantasy movies released, the vote average of a movie is expected to decrease by 0.0378 points.

December: For every 1 standard deviation increase in the number of movies released in December, the vote average of a movie is expected to increase by 0.0199 points.

Crime: For every 1 standard deviation increase in the proportion of crime movies released, the vote average of a movie is expected to decrease by 0.0293 points.

October: The coefficient for October is not statistically significant, which means we cannot say with confidence that there is a relationship between the number of movies released in October and the vote average of a movie.

Romance: For every 1 standard deviation increase in the proportion of romance movies released, the vote average of a movie is expected to increase by 0.0302 points.

Music: For every 1 standard deviation increase in the proportion of music movies released, the vote average of a movie is expected to increase by 0.0205 points.

History: For every 1 standard deviation increase in the proportion of history movies released, the vote average of a movie is expected to increase by 0.0201 points.

Family: For every 1 standard deviation increase in the proportion of family movies released, the vote average of a movie is expected to increase by 0.0183 points.

July: For each 1 standard deviation increase in the number of releases in July, the movie's vote average is expected to decrease by 0.013 points, all other factors held constant.

March: For each 1 standard deviation increase in the number of releases in March, the movie's vote average is expected to decrease by 0.034 points, all other factors held constant.

January: For each 1 standard deviation increase in the number of releases in January, the movie's vote average is expected to decrease by 0.033 points, all other factors held constant.

September: For each 1 standard deviation increase in the number of releases in September, the movie's vote average is expected to decrease by 0.034 points, all other factors held constant.

June: For each 1 standard deviation increase in the number of releases in June, the movie's vote average is expected to decrease by 0.030 points, all other factors held constant.

May: For each 1 standard deviation increase in the number of releases in May, the movie's vote average is expected to decrease by 0.026 points, all other factors held constant.

August: For each 1 standard deviation increase in the number of releases in August, the movie's vote average is expected to decrease by 0.018 points, all other factors held constant.

War: For each 1 standard deviation increase in the number of war movies released, the movie's vote average is expected to increase by 0.010 points, all other factors held constant.

# MODEL 3 - OLS Linear Regression, Vote_Count as DV

In [None]:
# Predictor groups for the unstandardized Vote_Count model.
# Concatenating them below reproduces the intended column order.
model3_numeric = ['Budget', 'Runtime', 'Popularity', 'Revenue',
                  'Vote_Average', 'Reviewer_Rating', 'Tomato_Meter']
model3_months = ['April', 'August', 'December', 'February', 'January', 'July',
                 'June', 'March', 'May', 'November', 'October', 'September']
model3_genres = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime',
                 'Documentary', 'Drama', 'Family', 'Fantasy', 'History',
                 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction',
                 'TV Movie', 'Thriller', 'War', 'Western']

# Independent variable matrix (with an intercept column) and dependent variable.
# NOTE(review): all twelve month dummies are entered together with a constant;
# if every movie has exactly one month flag this is perfectly collinear -- confirm.
X = sm.add_constant(df_copy[model3_numeric + model3_months + model3_genres])
y = df_copy['Vote_Count']

# Fit ordinary least squares and display the regression table
model3 = sm.OLS(y, X).fit()
print(model3.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# One variance inflation factor per column of the current design matrix X
design_values = X.values
vif_values = [
    variance_inflation_factor(design_values, col_idx)
    for col_idx in range(design_values.shape[1])
]
vifs = pd.Series(vif_values, index=X.columns)
print("VIFs:")
print(vifs)

### Output results (Vote_Count Model, no standardization or stepwise): 
##### The condition number is above 30, so the predictors need to be standardized. There are also severe multicollinearity issues, but we can resolve these in a standardized stepwise model.
##### The model explains 77.2% of the variance in the Vote_Count of movies.
const: This is the intercept term of the regression model, which represents the predicted vote count for a movie with a budget of zero, a runtime of zero minutes, zero revenue, zero popularity, zero vote average, zero reviewer rating, zero tomato meter, and every month and genre indicator equal to zero. (All twelve months, including January, are entered as dummy variables in this model, so there is no omitted reference month.)
Budget: A one-unit increase in the budget is associated with a decrease of 2.965e-06 in the vote count, holding all other variables constant.
Runtime: A one-minute increase in runtime is associated with an increase of 11.3984 in the vote count, holding all other variables constant.
Popularity: A one-unit increase in the popularity score is associated with an increase of 0.1706 in the vote count, holding all other variables constant.
Revenue: A one-dollar increase in revenue is associated with an increase of 7.691e-06 in the vote count, holding all other variables constant.
Vote_Average: A one-point increase in the vote average is associated with an increase of 614.8021 in the vote count, holding all other variables constant.
Reviewer_Rating: A one-point increase in the reviewer rating is associated with an increase of 58.3942 in the vote count, holding all other variables constant.
Tomato_Meter: A one-point increase in the tomato meter is associated with an increase of 80.6092 in the vote count, holding all other variables constant.
April: A movie released in April is associated with an increase of 144.1554 in the vote count, holding all other variables constant.
August: A movie released in August is associated with a decrease of 84.8860 in the vote count, holding all other variables constant.
December: A movie released in December is associated with an increase of 129.8916 in the vote count, holding all other variables constant.
February: A movie released in February is associated with an increase of 782.3857 in the vote count, holding all other variables constant.
January: A movie released in January is associated with an increase of 184.0077 in the vote count, holding all other variables constant.
July: A movie released in July is associated with an increase of 43.5438 in the vote count, holding all other variables constant.
June: A movie released in June is associated with a decrease of 64.2642 in the vote count, holding all other variables constant.
March: A movie released in March is associated with an increase of 359.1890 in the vote count, holding all other variables constant.
May: A movie released in May is associated with an increase of 293.4497 in the vote count, holding all other variables constant.
November: A movie released in November is associated with an increase of 211.6036 in the vote count, holding all other variables constant.
October: A movie released in October is associated with an increase of 456.8653 in the vote count, holding all other variables constant.
September: A movie released in September is associated with an increase of 74.0855 in the vote count, holding all other variables constant.
Action: A movie classified as "Action" is associated with an increase of 1198.3069 in the vote count, holding all other variables constant.
Adventure: A movie classified as "Adventure" is associated with an increase of 27.5924 in the vote count, holding all other variables constant.
Animation: A movie classified as "Animation" is associated with a decrease of 116.9353 in the vote count, holding all other variables constant.

# New Model 3 - Forward Stepwise Standardized - Vote_Count as DV
##### The R-squared value of 0.735 indicates that approximately 73.5% of the variation in Vote_Count is explained by the independent variables in the model.

In [None]:
# Report the dimensions of the current predictor matrix and target, one per line
print(X.shape, y.shape, sep="\n")
# Replace y's existing index labels with a default integer index (done in place)
y.reset_index(drop=True, inplace=True)

In [None]:
# Candidate predictors for the Vote_Count model.
# Left out after inspecting VIFs:
#   - Vote_Average (VIF 40)
#   - Drama        (VIF 36)
#   - Runtime      (VIF 13)
#   - Budget       (VIF 6)
X = df_copy[[ 'Popularity', 'Revenue',
        'Reviewer_Rating', 'Tomato_Meter',
       'April', 'August', 'December', 'February', 'January', 'July', 'June',
       'March', 'May', 'November', 'October', 'September', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller',
       'War', 'Western']]

# Dependent variable. The index is reset here (not in a separate cell) because
# X_scaled_df below is rebuilt from a bare array and therefore carries a fresh
# 0..n-1 index; giving y the same index keeps the two aligned row-for-row no
# matter what index df_copy has or in which order the cells were run.
y = df_copy['Vote_Count'].reset_index(drop=True)

# Standardize predictor variables (zero mean, unit variance per column)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the standardized values in a DataFrame so the original column names are kept
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Forward stepwise regression function
def forward_selection(X, y, threshold_in=0.01, verbose=True):
    """Greedy forward stepwise selection of OLS predictors by p-value.

    Starts with no predictors. In each round, one OLS model (with intercept)
    is fitted per not-yet-selected column, using the already selected columns
    plus that candidate, and the candidate's p-value is recorded. The candidate
    with the smallest p-value is added when that p-value is below
    ``threshold_in``; otherwise selection stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictor columns.
    y : array-like
        Dependent variable passed to ``sm.OLS`` as the response.
    threshold_in : float, default 0.01
        A candidate is only added if its p-value is strictly below this value.
    verbose : bool, default True
        Print each added column together with its p-value.

    Returns
    -------
    The fitted ``sm.OLS`` results for the intercept plus the selected columns.
    """
    included = []
    while True:
        # Walk X's columns in their own order: a set difference has arbitrary
        # order, so ties between equal p-values could resolve differently
        # from one run to the next.
        excluded = [column for column in X.columns if column not in included]
        # Explicit float dtype: a data-less Series is object-typed on pandas 2.x,
        # and idxmin() is not supported for object dtype.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[included + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # With no candidates left, min() is NaN and the comparison is False,
        # so the loop also ends once every column has been selected.
        if not (best_pval < threshold_in):
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
    # Refit once on the final set of selected columns and hand back that fit
    model = sm.OLS(y, sm.add_constant(X[included])).fit()
    return model

# Run the stepwise search on the standardized predictors, then show the final OLS table
model = forward_selection(X=X_scaled_df, y=y)
print(model.summary())


In [None]:
#check vifs 
from statsmodels.stats.outliers_influence import variance_inflation_factor
# variance_inflation_factor regresses each column on the remaining columns and
# does not add an intercept itself, so the matrix it receives must already
# contain a constant column for the result to be the usual VIF.
# add_constant leaves the frame unchanged if a constant is already present.
# NOTE(review): if every movie has exactly one month flag set, the twelve month
# dummies sum to the constant and their VIFs will be infinite -- confirm, and
# consider dropping one month as the reference category.
X_vif = sm.add_constant(X)
vifs = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
)
print("VIFs:")
# The 'const' row is only there to make the other rows correct; it is not a
# multicollinearity diagnostic in its own right.
print(vifs)

### Output results (Vote_Count Model, stepwise and standardized): 
##### The condition number is below 30. The standardization and multicollinearity issues have been resolved.
##### The model explains 74.8% of the variance in the Vote_Count of movies. 
Revenue: A 1 standard deviation increase in Revenue (which is around $348,772) is associated with an increase in Vote_Count by 2,165 votes.

Crime: A 1 standard deviation increase in Crime movies (which is about 0.30) is associated with an increase in Vote_Count by 282 votes.

Documentary: A 1 standard deviation increase in Documentary movies (which is about 0.23) is associated with a decrease in Vote_Count by 176 votes.

Action: A 1 standard deviation increase in Action movies (which is about 0.26) is associated with an increase in Vote_Count by 278 votes.

Science Fiction: A 1 standard deviation increase in Science Fiction movies (which is about 0.24) is associated with an increase in Vote_Count by 292 votes.

Tomato_Meter: A 1 standard deviation increase in Tomato_Meter (which is about 20) is associated with an increase in Vote_Count by 154 votes.

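February: A 1 standard deviation increase in the standardized February release indicator is associated with an increase in Vote_Count by 181 votes. Because the predictors were standardized before fitting, this is not the raw gap between February and non-February releases; dividing the coefficient by the indicator's standard deviation gives the effect of the 0/1 flag itself. (The same reading applies to the other month entries in this list.)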

October: Releasing a movie in October is associated with an increase in Vote_Count by 198 votes compared to releasing a movie in a different month.

Romance: A 1 standard deviation increase in Romance movies (which is about 0.28) is associated with an increase in Vote_Count by 127 votes.

Mystery: A 1 standard deviation increase in Mystery movies (which is about 0.28) is associated with an increase in Vote_Count by 127 votes.

War: A 1 standard deviation increase in War movies (which is about 0.28) is associated with an increase in Vote_Count by 101 votes.

Horror: A 1 standard deviation increase in Horror movies (which is about 0.26) is associated with an increase in Vote_Count by 124 votes.

Thriller: A 1 standard deviation increase in Thriller movies (which is about 0.26) is associated with an increase in Vote_Count by 105 votes.

Music: A 1 standard deviation increase in Music movies (which is about 0.23) is associated with an increase in Vote_Count by 92 votes.

November: Releasing a movie in November is associated with an increase in Vote_Count by 144 votes compared to releasing a movie in a different month.

Reviewer_Rating: A 1 standard deviation increase in Reviewer_Rating (which is about 1.3) is associated with an increase in Vote_Count by 81 votes.

December: Releasing a movie in December is associated with an increase in Vote_Count by 104 votes compared to releasing a movie in a different month.

March: Releasing a movie in March is associated with an increase in Vote_Count by 107 votes compared to releasing a movie in a different month.

May: Releasing a movie in May is associated with an increase in Vote_Count by 89 votes compared to releasing a movie in a different month.

Comedy: A 1 standard deviation increase in Comedy movies (which is about 0.31) is associated with an increase in Vote_Count by 48 votes.

Popularity: A 1 standard deviation increase in Popularity (which is about 5.89) is associated with an increase in Vote_Count by 62 votes.

January: Releasing a movie in January is associated with an increase in Vote_Count by 68 votes compared to releasing a movie in a different month.

History: A 1 standard deviation increase in the standardized History indicator (a 0/1 genre flag on each movie) is associated with an increase in Vote_Count of approximately 38 votes, holding the other predictors constant. The coefficient cannot be multiplied by a "number of historical movies"; to get the raw difference between History and non-History movies, divide it by the indicator's standard deviation.

Animation: A 1 standard deviation increase in the standardized Animation indicator (a 0/1 genre flag on each movie) is associated with a decrease in Vote_Count of approximately 50 votes, holding the other predictors constant. The coefficient cannot be multiplied by a "number of animated movies"; to get the raw difference between Animation and non-Animation movies, divide it by the indicator's standard deviation.

Adventure: A 1 standard deviation increase in the standardized Adventure indicator (a 0/1 genre flag on each movie) is associated with a decrease in Vote_Count of approximately 46 votes, holding the other predictors constant. The coefficient cannot be multiplied by a "number of adventure movies"; to get the raw difference between Adventure and non-Adventure movies, divide it by the indicator's standard deviation.

September: A 1 standard deviation increase in the standardized September release indicator (a 0/1 flag on each movie) is associated with an increase in Vote_Count of approximately 57 votes, holding the other predictors constant. Multiplying this coefficient by 30 has no meaning for a 0/1 flag; to get the raw effect of a September release, divide the coefficient by the indicator's standard deviation.

Western: A 1 standard deviation increase in the standardized Western indicator (a 0/1 genre flag on each movie) is associated with an increase in Vote_Count of approximately 31 votes, holding the other predictors constant. The coefficient cannot be multiplied by a "number of western movies"; to get the raw difference between Western and non-Western movies, divide it by the indicator's standard deviation.

July: A 1 standard deviation increase in the standardized July release indicator (a 0/1 flag on each movie) is associated with an increase in Vote_Count of approximately 42 votes, holding the other predictors constant. Multiplying this coefficient by 30 has no meaning for a 0/1 flag; to get the raw effect of a July release, divide the coefficient by the indicator's standard deviation.

April: A 1 standard deviation increase in the standardized April release indicator (a 0/1 flag on each movie) is associated with an increase in Vote_Count of approximately 28 votes, holding the other predictors constant. Multiplying this coefficient by 30 has no meaning for a 0/1 flag; to get the raw effect of an April release, divide the coefficient by the indicator's standard deviation.

Fantasy: A 1 standard deviation increase in the standardized Fantasy indicator (a 0/1 genre flag on each movie) is associated with a decrease in Vote_Count of approximately 17 votes, holding the other predictors constant. The coefficient cannot be multiplied by a "number of fantasy movies"; to get the raw difference between Fantasy and non-Fantasy movies, divide it by the indicator's standard deviation.

TV Movie: A 1 standard deviation increase in the standardized TV Movie indicator (a 0/1 genre flag on each movie) is associated with a decrease in Vote_Count of approximately 11 votes, holding the other predictors constant. The coefficient cannot be multiplied by a "number of TV movies"; to get the raw difference between TV movies and other movies, divide it by the indicator's standard deviation.

In [None]:
# Mean and standard deviation of the Vote_Count column
vote_count = df_copy['Vote_Count']
mean_vote_count = vote_count.mean()
std_vote_count = vote_count.std()

# Edges of the band one standard deviation either side of the mean
upper_vote_count = mean_vote_count + std_vote_count
lower_vote_count = mean_vote_count - std_vote_count

# Report every figure to two decimal places
print(f'Mean Vote_Count: {mean_vote_count:.2f}')
print(f'Standard Deviation of Vote_Count: {std_vote_count:.2f}')
print(f'One Standard Deviation Above Mean Vote_Count: {upper_vote_count:.2f}')
print(f'One Standard Deviation Below Mean Vote_Count: {lower_vote_count:.2f}')