# Steam Reviews Dataset EDA

## 1. Setup and Data Loading

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings

# Configure visualization settings
plt.style.use('ggplot')
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')

# Display settings
%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Download NLTK resources
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [4]:
# https://drive.google.com/file/d/15yHHDXb7JhYW09IQJ90qUi2SHgJk3QyS/view?usp=sharing
file_path = "https://drive.google.com/uc?export=download&id=15yHHDXb7JhYW09IQJ90qUi2SHgJk3QyS"

try:
    df = pd.read_csv(file_path)
except Exception as e:
    print(f"Error loading CSV with default settings: {e}")
    # Try with different encoding options
    try:
        df = pd.read_csv(file_path, encoding='latin1')
        print("Successfully loaded with latin1 encoding")
    except:
        try:
            df = pd.read_csv(file_path, encoding='utf-8-sig')
            print("Successfully loaded with utf-8-sig encoding")
        except Exception as e2:
            print(f"Still encountering issues: {e2}")
            # Try reading just the first few lines to diagnose
            import subprocess
            print("File preview:")
            !head -n 5 {file_path}

# Display the first few rows to verify loading
df.head()

Unnamed: 0,"<!DOCTYPE html><html><head><title>Google Drive - Virus scan warning</title><meta http-equiv=""content-type"" content=""text/html; charset=utf-8""/><style nonce=""3WSUK1_wicqFP-RtMwOPbg"">.goog-link-button{position:relative;color:#15c;text-decoration:underline;cursor:pointer}.goog-link-button-disabled{color:#ccc;text-decoration:none;cursor:default}body{color:#222;font:normal 13px/1.4 arial",sans-serif;margin:0}.grecaptcha-badge{visibility:hidden}.uc-main{padding-top:50px;text-align:center}#uc-dl-icon{display:inline-block;margin-top:16px;padding-right:1em;vertical-align:top}#uc-text{display:inline-block;max-width:68ex;text-align:left}.uc-error-caption,".uc-warning-caption{color:#222;font-size:16px}#uc-download-link{text-decoration:none}.uc-name-size a{color:#15c;text-decoration:none}.uc-name-size a:visited{color:#61c;text-decoration:none}.uc-name-size a:active{color:#d14836;text-decoration:none}.uc-footer{color:#777;font-size:11px;padding-bottom:5ex;padding-top:5ex;text-align:center}.uc-footer a{color:#15c}.uc-footer a:visited{color:#61c}.uc-footer a:active{color:#d14836}.uc-footer-divider{color:#ccc;width:100%}.goog-inline-block{position:relative;display:-moz-inline-box;display:inline-block}* html .goog-inline-block{display:inline}*:first-child+html .goog-inline-block{display:inline}sentinel{}</style><link rel=""icon"" href=""//ssl.gstatic.com/docs/doclist/images/drive_2022q3_32dp.png""/></head><body><div class=""uc-main""><div id=""uc-dl-icon"" class=""image-container""><div class=""drive-sprite-aux-download-file""></div></div><div id=""uc-text""><p class=""uc-warning-caption"">Google Drive can't scan this file for viruses.</p><p class=""uc-warning-subcaption""><span class=""uc-name-size""><a href=""/open?id=15yHHDXb7JhYW09IQJ90qUi2SHgJk3QyS"">dataset.csv</a> (2.0G)</span> is too large for Google to scan for viruses. Would you still like to download this file?</p><form id=""download-form"" action=""https://drive.usercontent.google.com/download"" method=""get""><input type=""submit"" id=""uc-download-link"" class=""goog-inline-block jfk-button jfk-button-action"" value=""Download anyway""/><input type=""hidden"" name=""id"" value=""15yHHDXb7JhYW09IQJ90qUi2SHgJk3QyS""><input type=""hidden"" name=""export"" value=""download""><input type=""hidden"" name=""confirm"" value=""t""><input type=""hidden"" name=""uuid"" value=""3951184f-a93a-457a-8eec-f9ccf3969e0b""></form></div></div><div class=""uc-footer""><hr class=""uc-footer-divider""></div></body></html>"


## 2. Initial Data Exploration

In [None]:
# Check the shape of the dataset
print(f"Dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")

# Display column information
print("\nColumn Names:")
print(df.columns.tolist())

# Display data types and memory usage
print("\nData Types and Memory Usage:")
df.info()

In [None]:
# Basic statistics for numeric columns
print("Descriptive Statistics for Numeric Columns:")
df.describe(include=[np.number]).T

In [None]:
# Basic statistics for object columns
print("Descriptive Statistics for Text Columns:")
df.describe(include=['object']).T

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage
})

print("Missing Values Analysis:")
missing_df[missing_df['Missing Values'] > 0]  # Only show columns with missing values

In [None]:
# Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count} ({(duplicate_count/len(df))*100:.2f}% of the dataset)")

# If duplicates exist, show a few examples
if duplicate_count > 0:
    print("\nExample of duplicate entries:")
    df[df.duplicated(keep='first')].head()

## 3. Game Analysis

In [None]:
# Count of reviews by game
game_counts = df['app_name'].value_counts().reset_index()
game_counts.columns = ['app_name', 'review_count']

# Display the top games by review count
print("Top 15 Games by Review Count:")
game_counts.head(15)

In [None]:
# Visualize top games by review count
plt.figure(figsize=(12, 8))
top_games = game_counts.head(10).sort_values('review_count')

sns.barplot(data=top_games, y='app_name', x='review_count', palette='viridis')
plt.title('Top 10 Games by Number of Reviews', fontsize=16)
plt.xlabel('Number of Reviews', fontsize=14)
plt.ylabel('Game', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Calculate average review score by game
game_avg_scores = df.groupby('app_name')['review_score'].agg(['mean', 'count']).reset_index()
game_avg_scores = game_avg_scores.rename(columns={'mean': 'avg_score', 'count': 'review_count'})
game_avg_scores = game_avg_scores.sort_values('avg_score', ascending=False)

# Filter to games with a minimum number of reviews for statistical significance
min_reviews = 10
top_rated_games = game_avg_scores[game_avg_scores['review_count'] >= min_reviews].head(15)
print(f"Top 15 Highest-Rated Games (with at least {min_reviews} reviews):")
top_rated_games

In [None]:
# Visualize top-rated games (with a minimum number of reviews)
plt.figure(figsize=(12, 8))
top_10_rated = top_rated_games.head(10).sort_values('avg_score')

colors = sns.color_palette("RdYlGn", 10)
ax = sns.barplot(data=top_10_rated, y='app_name', x='avg_score', palette=colors)

# Add review count as text
for i, (score, count) in enumerate(zip(top_10_rated['avg_score'], top_10_rated['review_count'])):
    ax.text(score + 0.01, i, f"n={count}", va='center')

plt.title(f'Top 10 Highest-Rated Games (with ≥{min_reviews} reviews)', fontsize=16)
plt.xlabel('Average Review Score', fontsize=14)
plt.ylabel('Game', fontsize=14)
plt.xlim(0, 1.1)  # Assuming scores are between 0 and 1
plt.tight_layout()
plt.show()

In [None]:
# Lowest rated games
lowest_rated_games = game_avg_scores[game_avg_scores['review_count'] >= min_reviews].tail(15).sort_values('avg_score')
print(f"15 Lowest-Rated Games (with at least {min_reviews} reviews):")
lowest_rated_games

In [None]:
# Visualize lowest-rated games
plt.figure(figsize=(12, 8))
bottom_10_rated = lowest_rated_games.head(10).sort_values('avg_score', ascending=False)

colors = sns.color_palette("RdYlGn_r", 10)
ax = sns.barplot(data=bottom_10_rated, y='app_name', x='avg_score', palette=colors)

# Add review count as text
for i, (score, count) in enumerate(zip(bottom_10_rated['avg_score'], bottom_10_rated['review_count'])):
    ax.text(score + 0.01, i, f"n={count}", va='center')

plt.title(f'10 Lowest-Rated Games (with ≥{min_reviews} reviews)', fontsize=16)
plt.xlabel('Average Review Score', fontsize=14)
plt.ylabel('Game', fontsize=14)
plt.xlim(0, 1.0)  # Assuming scores are between 0 and 1
plt.tight_layout()
plt.show()

In [None]:
# Distribution of average review scores across all games
plt.figure(figsize=(12, 6))
sns.histplot(game_avg_scores['avg_score'], bins=20, kde=True)
plt.axvline(game_avg_scores['avg_score'].mean(), color='red', linestyle='dashed', linewidth=1)
plt.text(game_avg_scores['avg_score'].mean() + 0.02, plt.gca().get_ylim()[1] * 0.8, 
         f'Mean: {game_avg_scores["avg_score"].mean():.2f}', color='red')

plt.title('Distribution of Average Review Scores Across Games', fontsize=16)
plt.xlabel('Average Review Score', fontsize=14)
plt.ylabel('Count of Games', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of review count vs. average score
plt.figure(figsize=(12, 8))
sns.scatterplot(data=game_avg_scores, x='review_count', y='avg_score', alpha=0.6, size='review_count', 
                sizes=(20, 200), hue='avg_score', palette='RdYlGn')

# Add labels to notable points (high review count or extreme scores)
for idx, row in game_avg_scores[game_avg_scores['review_count'] > game_avg_scores['review_count'].quantile(0.95)].iterrows():
    plt.annotate(row['app_name'], (row['review_count'], row['avg_score']), 
                 xytext=(5, 5), textcoords='offset points', fontsize=9)

plt.title('Review Count vs. Average Score by Game', fontsize=16)
plt.xlabel('Number of Reviews', fontsize=14)
plt.ylabel('Average Review Score', fontsize=14)
plt.xscale('log')  # Log scale for better visualization if there's high variance
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 4. Review Score Analysis

In [None]:
# Distribution of review scores
score_counts = df['review_score'].value_counts().sort_index().reset_index()
score_counts.columns = ['review_score', 'count']
score_counts['percentage'] = (score_counts['count'] / len(df)) * 100

print("Distribution of Review Scores:")
score_counts

In [None]:
# Visualize the distribution of review scores
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=score_counts, x='review_score', y='count', palette='viridis')

# Add percentage labels on top of bars
for i, row in score_counts.iterrows():
    ax.text(i, row['count'], f"{row['percentage']:.1f}%", ha='center', va='bottom')

plt.title('Distribution of Review Scores', fontsize=16)
plt.xlabel('Review Score', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=0)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## 5. Review Votes Analysis

In [None]:
# Distribution of review votes
print("Distribution of Review Votes:")
vote_stats = df['review_votes'].describe()
vote_stats

In [None]:
# Create bins for review votes to better visualize the distribution
max_votes_to_display = df['review_votes'].quantile(0.99)  # Ignore extreme outliers
df_for_plot = df[df['review_votes'] <= max_votes_to_display]

plt.figure(figsize=(12, 6))
sns.histplot(df_for_plot['review_votes'], bins=30, kde=False)
plt.title(f'Distribution of Review Votes (excluding top 1% outliers)', fontsize=16)
plt.xlabel('Number of Votes', fontsize=14)
plt.ylabel('Count of Reviews', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Relationship between review score and votes
vote_by_score = df.groupby('review_score')['review_votes'].agg(['mean', 'median', 'count']).reset_index()
print("Average and Median Votes by Review Score:")
vote_by_score

In [None]:
# Visualize relationship between review score and votes
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=vote_by_score, x='review_score', y='mean', palette='viridis')

# Add review count as text
for i, count in enumerate(vote_by_score['count']):
    ax.text(i, vote_by_score['mean'].iloc[i] + 0.1, f"n={count:,}", ha='center')

plt.title('Average Review Votes by Score', fontsize=16)
plt.xlabel('Review Score', fontsize=14)
plt.ylabel('Average Number of Votes', fontsize=14)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Reviews with most votes
top_voted_reviews = df.sort_values('review_votes', ascending=False).head(10)
print("Top 10 Most Voted Reviews:")
top_voted_reviews[['app_name', 'review_score', 'review_votes', 'review_text']]

## 6. Review Text Analysis

In [None]:
# Add column for review text length
df['review_length'] = df['review_text'].apply(lambda x: len(str(x)))
df['review_word_count'] = df['review_text'].apply(lambda x: len(str(x).split()))

# Basic statistics on review text length
print("Review Text Length Statistics:")
length_stats = df[['review_length', 'review_word_count']].describe().T
length_stats

In [None]:
# Distribution of review lengths
plt.figure(figsize=(12, 6))
sns.histplot(df['review_length'], bins=50, kde=True)
plt.axvline(df['review_length'].mean(), color='red', linestyle='dashed', linewidth=1)
plt.text(df['review_length'].mean() + 10, plt.gca().get_ylim()[1] * 0.8, 
         f'Mean: {df["review_length"].mean():.1f}', color='red')

plt.title('Distribution of Review Text Length (Characters)', fontsize=16)
plt.xlabel('Number of Characters', fontsize=14)
plt.ylabel('Count of Reviews', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of review word counts
plt.figure(figsize=(12, 6))
sns.histplot(df['review_word_count'], bins=50, kde=True)
plt.axvline(df['review_word_count'].mean(), color='red', linestyle='dashed', linewidth=1)
plt.text(df['review_word_count'].mean() + 2, plt.gca().get_ylim()[1] * 0.8, 
         f'Mean: {df["review_word_count"].mean():.1f}', color='red')

plt.title('Distribution of Review Word Count', fontsize=16)
plt.xlabel('Number of Words', fontsize=14)
plt.ylabel('Count of Reviews', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Compare review length by score
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='review_score', y='review_word_count', palette='viridis')
plt.title('Review Word Count by Score', fontsize=16)
plt.xlabel('Review Score', fontsize=14)
plt.ylabel('Word Count', fontsize=14)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Compare review length by votes
# Create vote bins for better analysis
df['vote_bins'] = pd.qcut(
    df['review_votes'], 
    q=[0, 0.25, 0.5, 0.75, 0.9, 0.95, 1.0], 
    labels=['0-25%', '25-50%', '50-75%', '75-90%', '90-95%', '95-100%']
)

plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='vote_bins', y='review_word_count', palette='viridis')
plt.title('Review Word Count by Vote Percentile', fontsize=16)
plt.xlabel('Vote Percentile', fontsize=14)
plt.ylabel('Word Count', fontsize=14)
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Function for text preprocessing
def preprocess_text(text):
    if not isinstance(text, str):
        return []
    
    # Convert to lowercase and remove non-alphanumeric
    text = re.sub(r'[^\w\s]', '', text.lower())
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    
    return tokens

In [None]:
# Word frequency analysis for all reviews
# Sample a subset if the dataset is large
sample_size = min(10000, len(df))
sample_df = df.sample(sample_size, random_state=42)

# Process the sampled text
all_words = []
for text in sample_df['review_text']:
    all_words.extend(preprocess_text(text))

# Count word frequencies
word_freq = Counter(all_words)
common_words = word_freq.most_common(30)

print("Most Common Words in Reviews:")
for word, count in common_words:
    print(f"{word}: {count}")

In [None]:
# Visualize most common words
plt.figure(figsize=(14, 8))
words, counts = zip(*common_words)
sns.barplot(x=list(counts), y=list(words), palette='viridis')
plt.title('30 Most Common Words in Reviews', fontsize=16)
plt.xlabel('Frequency', fontsize=14)
plt.ylabel('Word', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=100, 
                     colormap='viridis', collocations=False).generate_from_frequencies(word_freq)

plt.figure(figsize=(16, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Review Text', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Separate reviews by score and analyze word frequencies
positive_reviews = sample_df[sample_df['review_score'] == 1]['review_text']
negative_reviews = sample_df[sample_df['review_score'] == 0]['review_text']

positive_words = []
for text in positive_reviews:
    positive_words.extend(preprocess_text(text))
    
negative_words = []
for text in negative_reviews:
    negative_words.extend(preprocess_text(text))

positive_freq = Counter(positive_words)
negative_freq = Counter(negative_words)

# Top positive words
print("Most Common Words in Positive Reviews:")
for word, count in positive_freq.most_common(15):
    print(f"{word}: {count}")
    
print("\nMost Common Words in Negative Reviews:")
for word, count in negative_freq.most_common(15):
    print(f"{word}: {count}")

In [None]:
# Compare top words in positive and negative reviews
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

# Positive reviews
pos_words, pos_counts = zip(*positive_freq.most_common(15))
sns.barplot(x=list(pos_counts), y=list(pos_words), palette='Greens_r', ax=ax1)
ax1.set_title('Top Words in Positive Reviews', fontsize=16)
ax1.set_xlabel('Frequency', fontsize=14)
ax1.set_ylabel('Word', fontsize=14)
ax1.grid(True, alpha=0.3)

# Negative reviews
neg_words, neg_counts = zip(*negative_freq.most_common(15))
sns.barplot(x=list(neg_counts), y=list(neg_words), palette='Reds_r', ax=ax2)
ax2.set_title('Top Words in Negative Reviews', fontsize=16)
ax2.set_xlabel('Frequency', fontsize=14)
ax2.set_ylabel('Word', fontsize=14)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Generate word clouds for positive and negative reviews
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

# Positive reviews wordcloud
positive_cloud = WordCloud(width=800, height=400, background_color='white', max_words=100, 
                         colormap='Greens', collocations=False).generate_from_frequencies(positive_freq)
ax1.imshow(positive_cloud, interpolation='bilinear')
ax1.axis('off')
ax1.set_title('Word Cloud - Positive Reviews', fontsize=16)

# Negative reviews wordcloud
negative_cloud = WordCloud(width=800, height=400, background_color='white', max_words=100, 
                         colormap='Reds', collocations=False).generate_from_frequencies(negative_freq)
ax2.imshow(negative_cloud, interpolation='bilinear')
ax2.axis('off')
ax2.set_title('Word Cloud - Negative Reviews', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Calculate the distinctive words for positive and negative reviews
def get_distinctive_words(freq1, freq2, top_n=15):
    # Create sets of all words
    all_words = set(freq1.keys()) | set(freq2.keys())
    
    # Calculate the ratio of frequencies
    distinctive_words = {}
    for word in all_words:
        # Get frequencies with a minimum of 1 to avoid division by zero
        freq1_count = freq1.get(word, 1)
        freq2_count = freq2.get(word, 1)
        
        # Only consider words that appear at least 5 times in one category
        if freq1_count >= 5 or freq2_count >= 5:
            # Calculate ratio (normalized by total words in each category)
            ratio = (freq1_count / sum(freq1.values())) / (freq2_count / sum(freq2.values()))
            distinctive_words[word] = ratio
    
    # Sort and get top distinctive words
    return sorted(distinctive_words.items(), key=lambda x: x[1], reverse=True)[:top_n]

# Get distinctive words for positive and negative reviews
positive_distinctive = get_distinctive_words(positive_freq, negative_freq)
negative_distinctive = get_distinctive_words(negative_freq, positive_freq)

print("Words More Common in Positive Reviews:")
for word, ratio in positive_distinctive:
    print(f"{word}: {ratio:.2f}x more common")

print("\nWords More Common in Negative Reviews:")
for word, ratio in negative_distinctive:
    print(f"{word}: {ratio:.2f}x more common")

In [None]:
# Visualize distinctive words
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

# Positive distinctive words
pos_dist_words, pos_dist_ratios = zip(*positive_distinctive)
sns.barplot(x=list(pos_dist_ratios), y=list(pos_dist_words), palette='Greens_r', ax=ax1)
ax1.set_title('Words More Common in Positive Reviews', fontsize=16)
ax1.set_xlabel('Frequency Ratio (compared to negative reviews)', fontsize=14)
ax1.set_ylabel('Word', fontsize=14)
ax1.grid(True, alpha=0.3)

# Negative distinctive words
neg_dist_words, neg_dist_ratios = zip(*negative_distinctive)
sns.barplot(x=list(neg_dist_ratios), y=list(neg_dist_words), palette='Reds_r', ax=ax2)
ax2.set_title('Words More Common in Negative Reviews', fontsize=16)
ax2.set_xlabel('Frequency Ratio (compared to positive reviews)', fontsize=14)
ax2.set_ylabel('Word', fontsize=14)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Relationships between Variables

In [None]:
# Correlation between numeric variables
numeric_df = df[['review_score', 'review_votes', 'review_length', 'review_word_count']]
correlation = numeric_df.corr()

print("Correlation Between Numeric Variables:")
correlation

In [None]:
# Visualize correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, 
            square=True, linewidths=.5, cbar_kws={"shrink": .8})
plt.title('Correlation Matrix of Numeric Variables', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of review length vs votes with score as color
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df.sample(min(5000, len(df)), random_state=42), 
                x='review_word_count', y='review_votes', 
                hue='review_score', palette={0: 'red', 1: 'green'}, alpha=0.6)
plt.title('Relationship between Review Length and Votes', fontsize=16)
plt.xlabel('Review Word Count', fontsize=14)
plt.ylabel('Number of Votes', fontsize=14)
plt.legend(title='Review Score')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 8. Key Insights and Conclusion

### Summary of Findings:

1. **Review Score Distribution:**
   - The dataset shows [to be filled after analysis] distribution of positive vs. negative reviews
   - [Game-specific insights about highest and lowest rated games]

2. **Review Votes Patterns:**
   - [Insights about which reviews get more votes]
   - [Any correlation between review score and votes]

3. **Review Text Analysis:**
   - [Insights about review length]
   - [Common words and themes in positive vs negative reviews]
   - [Any interesting patterns in review language]

4. **Game-specific Insights:**
   - [Insights about most reviewed games]
   - [Patterns across different games/genres]

5. **Relationships Between Variables:**
   - [Key correlations discovered]
   - [Any surprising relationships]

### Recommendations for Further Analysis:

1. **Sentiment Analysis:** Apply more advanced NLP techniques to better understand nuanced sentiments in reviews.
2. **Topic Modeling:** Identify common themes and topics within reviews using techniques like LDA.
3. **Temporal Analysis:** If timestamp data is available, analyze how reviews change over time.
4. **Game Category Analysis:** Group games by genre/category to identify category-specific patterns.
5. **Predictive Modeling:** Build models to predict review scores or votes based on text content.

This exploratory analysis provides a foundation for understanding user sentiment in Steam reviews, highlighting patterns in how users evaluate games and what aspects tend to influence their opinions.