In [1]:
import os
import webbrowser
from threading import Timer

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
from dash import Dash, dcc, html
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Download VADER lexicon for sentiment analysis.
# The lexicon is needed by the SentimentIntensityAnalyzer created later in
# this notebook; quiet=True keeps the download from printing progress output.
nltk.download('vader_lexicon', quiet=True)

True

In [3]:
# Helper that launches the web browser on the dashboard page
def open_browser():
    """Open the locally served dashboard URL (port 8050) in a new browser window."""
    dashboard_url = "http://127.0.0.1:8050/"
    webbrowser.open_new(dashboard_url)

In [4]:
# Step 1: Load the Dataset
# The folder holding the CSV files can be overridden with the
# PLAYSTORE_DATA_DIR environment variable. When it is unset the original local
# folder is used, so existing runs read exactly the same files as before.
DATA_DIR = os.environ.get("PLAYSTORE_DATA_DIR", r"C:\Users\gayuu\Desktop\NullClass")
apps_df = pd.read_csv(os.path.join(DATA_DIR, "Play Store Data.csv"))
reviews_df = pd.read_csv(os.path.join(DATA_DIR, "User Reviews.csv"))


In [5]:
# Step 2: Data Cleaning
print("Cleaning data...")
# Rows without a rating are removed first; .copy() gives an independent frame
# so the column assignments below do not act on a slice of the loaded data.
apps_df = apps_df.dropna(subset=['Rating']).copy()
# Fill the remaining gaps with each column's most frequent value. The filled
# column is assigned back instead of calling fillna(inplace=True) on
# apps_df[column]: that chained in-place call operates on a temporary object
# and is not guaranteed to update apps_df (it does nothing under pandas
# copy-on-write).
for column in apps_df.columns:
    column_modes = apps_df[column].mode()
    if not column_modes.empty:  # an all-NaN column has no mode to fill with
        apps_df[column] = apps_df[column].fillna(column_modes.iloc[0])
apps_df = apps_df.drop_duplicates()
# Keep only ratings of at most 5; the trailing .copy() lets later cells add
# columns to apps_df without SettingWithCopyWarning.
apps_df = apps_df[apps_df['Rating'] <= 5].copy()
# Reviews without translated text cannot be scored, so they are dropped.
reviews_df = reviews_df.dropna(subset=['Translated_Review'])


Cleaning data...


In [6]:
# Step 3: Data Transformation
print("Transforming data...")
apps_df['Reviews'] = apps_df['Reviews'].astype(int)
# regex=False makes ',', '+' and '$' plain characters on every pandas version
# ('+' and '$' are regex metacharacters, and the default of `regex` differs
# between pandas releases). Casting to str first allows this cell to be re-run
# after the columns have already been converted to numbers.
apps_df['Installs'] = (
    apps_df['Installs'].astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(int)
)
apps_df['Price'] = (
    apps_df['Price'].astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)

def convert_size(size):
    """Convert a size value to a number of megabytes.

    Args:
        size: Size text containing an 'M' (megabytes) or 'k' (kilobytes)
            marker, e.g. '19M' or '512k', or a number that has already been
            converted.

    Returns:
        float: The size in megabytes; 'k' values are divided by 1024.
        np.nan is returned when no number can be extracted (text without a
        unit marker, malformed text, None, or any other type).
    """
    # Numbers pass through unchanged so that applying the function a second
    # time to an already-converted column keeps the values instead of
    # replacing every one of them with NaN.
    if isinstance(size, (int, float)) and not isinstance(size, bool):
        return float(size)
    if not isinstance(size, str):
        return np.nan

    text = size.strip().replace(',', '')
    try:
        if 'M' in text:
            return float(text.replace('M', ''))
        if 'k' in text:
            return float(text.replace('k', '')) / 1024
    except ValueError:
        # A unit letter is present but the rest is not a parsable number.
        pass
    return np.nan

# Replace the raw 'Size' values with the result of convert_size for each row
size_converted = apps_df['Size'].apply(convert_size)
apps_df['Size'] = size_converted


Transforming data...


In [7]:
# Derive log1p-scaled companions of the install and review counts
for source_col, log_col in (('Installs', 'Log_Installs'), ('Reviews', 'Log_Reviews')):
    apps_df[log_col] = np.log1p(apps_df[source_col])


In [8]:
# Bucket a numeric rating into a labelled star band (used for Rating_Group)
def rating_group(rating):
    """Return the star-band label for a numeric rating.

    Band floors are checked from the highest downwards; a rating that reaches
    none of them is labelled 'Below 2 stars'.
    """
    bands = ((4, '4-5 stars'), (3, '3-4 stars'), (2, '2-3 stars'))
    for floor, label in bands:
        if rating >= floor:
            return label
    return 'Below 2 stars'

# Label every app with the star band of its rating
rating_labels = apps_df['Rating'].apply(rating_group)
apps_df['Rating_Group'] = rating_labels



In [9]:
# Revenue column: each app's price multiplied by its install count
app_price = apps_df['Price']
app_installs = apps_df['Installs']
apps_df['Revenue'] = app_price * app_installs

# Sentiment Analysis

In [10]:
# Score every translated review with the VADER analyzer
print("Performing sentiment analysis...")
sia = SentimentIntensityAnalyzer()


def _compound_score(review_text):
    """Return the 'compound' polarity score of one review (coerced to str)."""
    return sia.polarity_scores(str(review_text))['compound']


reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(_compound_score)


Performing sentiment analysis...


In [11]:
# Parse 'Last Updated' into datetimes (unparseable values are coerced rather
# than raising), then derive the 'Year' column from the parsed dates
last_updated = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
apps_df['Last Updated'] = last_updated
apps_df['Year'] = last_updated.dt.year

In [12]:
# Create the Dash app with a viewport meta tag so the page scales to the
# width of the device it is viewed on
viewport_tag = {"name": "viewport", "content": "width=device-width, initial-scale=1"}
app = Dash(__name__, meta_tags=[viewport_tag])

In [13]:
# Set the title that appears in the browser tab for the dashboard page
app.title = "App Store Data Dashboard"

In [14]:
# Create visualizations inside a function that can be called when app starts

# Inline style shared by every dashboard card.
_CARD_STYLE = {
    'backgroundColor': 'white',
    'padding': '15px',
    'borderRadius': '5px',
    'boxShadow': '0px 0px 10px rgba(0,0,0,0.1)',
    'marginBottom': '20px',
}


def _build_card(number, card_id, heading, build_figure):
    """Wrap one visualization in a styled card.

    Args:
        number: Visualization number used in the console error message.
        card_id: HTML id given to the card's outer Div.
        heading: Text of the card's H3 heading.
        build_figure: Zero-argument callable returning the plotly figure.

    Returns:
        html.Div containing the heading and either the graph or, when
        build_figure raises, a paragraph with the error text. Catching the
        error here keeps one failing chart from breaking the whole dashboard.
    """
    try:
        content = dcc.Graph(figure=build_figure())
    except Exception as e:
        print(f"Error in visualization {number}: {e}")
        content = html.P(f"Error loading visualization: {str(e)}")
    return html.Div(id=card_id, children=[
        html.H3(heading, style={'textAlign': 'center'}),
        content
    ], style=dict(_CARD_STYLE))


def _sentiment_figure():
    """Visualization 1: stacked bars of review sentiment per rating group."""
    # Five categories with the most apps, among categories whose apps have
    # more than 1000 reviews in total.
    reviews_per_category = apps_df.groupby('Category')['Reviews'].sum()
    busy_categories = reviews_per_category[reviews_per_category > 1000].index
    top_categories = (
        apps_df[apps_df['Category'].isin(busy_categories)]
        .groupby('Category').size().nlargest(5).index
    )
    top_apps = apps_df.loc[apps_df['Category'].isin(top_categories), 'App']
    filtered_reviews = reviews_df[reviews_df['App'].isin(top_apps)]
    # apps_df can hold several rows for the same App (only fully identical
    # rows were de-duplicated), and a left merge would then repeat every
    # review of such an app. One Rating_Group per app keeps counts honest.
    rating_by_app = apps_df[['App', 'Rating_Group']].drop_duplicates(subset='App')
    filtered_reviews = pd.merge(filtered_reviews, rating_by_app, on='App', how='left')
    # include_lowest=True keeps a score of exactly -1 in the 'Negative' bin
    # instead of leaving it unclassified.
    filtered_reviews['Sentiment_Group'] = pd.cut(
        filtered_reviews['Sentiment_Score'],
        bins=[-1, -0.05, 0.05, 1],
        labels=['Negative', 'Neutral', 'Positive'],
        include_lowest=True,
    )
    # observed=False is stated explicitly: Sentiment_Group is categorical and
    # the default for this argument is changing between pandas versions.
    sentiment_counts = (
        filtered_reviews
        .groupby(['Sentiment_Group', 'Rating_Group'], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    fig = px.bar(sentiment_counts, title='Sentiment Distribution by Rating Group',
                 labels={'value': 'Count', 'Sentiment_Group': 'Sentiment', 'Rating_Group': 'Rating Group'},
                 color_discrete_sequence=px.colors.qualitative.Plotly)
    fig.update_layout(barmode='stack')
    return fig


def _installs_revenue_figure():
    """Visualization 2: average installs (bars) and revenue (line, right axis) by app Type."""
    filtered_apps = apps_df[(apps_df['Installs'] > 10000) & (apps_df['Revenue'] > 10000)]
    # Optional columns are checked before filtering on them.
    if 'Android Ver' in filtered_apps.columns:
        # Raw string: '\d' inside a normal string literal is an invalid escape
        # sequence. Values without a 'major.minor' number become NaN and are
        # dropped by the comparison.
        android_version = (
            filtered_apps['Android Ver']
            .str.extract(r'(\d+\.\d+)', expand=False)
            .astype(float)
        )
        filtered_apps = filtered_apps[android_version > 4.0]
    if 'Size' in filtered_apps.columns:
        filtered_apps = filtered_apps[filtered_apps['Size'] > 15]
    if 'Content Rating' in filtered_apps.columns:
        filtered_apps = filtered_apps[filtered_apps['Content Rating'] == 'Everyone']

    filtered_apps = filtered_apps[filtered_apps['App'].str.len() <= 30]

    top_categories = filtered_apps['Category'].value_counts().nlargest(3).index
    filtered_apps = filtered_apps[filtered_apps['Category'].isin(top_categories)]
    avg_installs = filtered_apps.groupby('Type')['Installs'].mean().reset_index()
    avg_revenue = filtered_apps.groupby('Type')['Revenue'].mean().reset_index()
    fig = px.bar(avg_installs, x='Type', y='Installs', title='Average Installs for Free vs Paid Apps',
                 labels={'Installs': 'Average Installs'}, color='Type')
    # Revenue is drawn against a second y-axis on the right-hand side.
    fig.add_scatter(x=avg_revenue['Type'], y=avg_revenue['Revenue'], mode='lines+markers',
                    name='Average Revenue', yaxis='y2')
    fig.update_layout(yaxis2=dict(title='Average Revenue', overlaying='y', side='right'))
    return fig


def _rating_review_figure():
    """Visualization 3: grouped bars of average rating and total reviews per category."""
    filtered_apps = apps_df[(apps_df['Size'] > 10) & (apps_df['Last Updated'].dt.month == 1)]
    avg_rating = filtered_apps.groupby('Category')['Rating'].mean().reset_index()
    total_reviews = filtered_apps.groupby('Category')['Reviews'].sum().reset_index()
    combined = pd.merge(avg_rating, total_reviews, on='Category')
    combined = combined[combined['Rating'] >= 4.0]
    # barmode='group' places the two metrics side by side; without it px.bar
    # stacks them, which contradicts the intended grouped bar chart.
    return px.bar(combined, x='Category', y=['Rating', 'Reviews'],
                  title='Average Rating and Total Review Count by Category',
                  labels={'value': 'Count', 'variable': 'Metric'},
                  barmode='group')


def _installs_trend_figure():
    """Visualization 4: line chart of total installs per 'Year' for the selected apps."""
    filtered_apps = apps_df
    if 'Content Rating' in filtered_apps.columns:
        filtered_apps = filtered_apps[filtered_apps['Content Rating'] == 'Teen']
    # Apps whose name starts with 'E' and that have more than 10000 installs.
    name_starts_with_e = filtered_apps['App'].str.startswith('E', na=False)
    filtered_apps = filtered_apps[name_starts_with_e & (filtered_apps['Installs'] > 10000)]
    installs_trend = filtered_apps.groupby(['Year'])['Installs'].sum().reset_index()
    return px.line(installs_trend, x='Year', y='Installs', title='Total Installs Over Time')


def _ratings_distribution_figure():
    """Visualization 5: violin plot of ratings per category for the selected apps."""
    selected = (
        apps_df['App'].str.contains('C', na=False)
        & (apps_df['Reviews'] > 10)
        & (apps_df['Rating'] < 4.0)
    )
    filtered_apps = apps_df[selected]
    # Only categories with more than 50 matching apps are plotted.
    category_counts = filtered_apps['Category'].value_counts()
    valid_categories = category_counts[category_counts > 50].index
    filtered_apps = filtered_apps[filtered_apps['Category'].isin(valid_categories)]
    return px.violin(filtered_apps, y='Rating', x='Category',
                     title='Distribution of Ratings by Category', box=True, points='all')


def create_layout():
    """Build the full dashboard layout from the prepared apps_df and reviews_df.

    Each visualization is built independently inside its own card; a chart
    that fails is replaced by an error message in that card only.

    Returns:
        html.Div holding the page heading followed by the five cards.
    """
    print("Creating visualizations and layout...")
    cards = [
        _build_card(1, 'sentiment_distribution',
                    "Sentiment Analysis by Rating Group", _sentiment_figure),
        _build_card(2, 'installs_revenue',
                    "Average Installs and Revenue: Free vs Paid Apps", _installs_revenue_figure),
        _build_card(3, 'rating_review_count',
                    "Average Rating and Total Reviews by Category", _rating_review_figure),
        _build_card(4, 'installs_trend',
                    "Total Installs Trend Over Time", _installs_trend_figure),
        _build_card(5, 'ratings_distribution',
                    "Distribution of Ratings by Category", _ratings_distribution_figure),
    ]

    # Compose full layout
    layout = html.Div([
        html.H1("Play Store Data Analysis Dashboard",
                style={'textAlign': 'center', 'color': '#2c3e50', 'marginBottom': '30px'}),
        *cards
    ], style={'backgroundColor': '#f0f2f5', 'fontFamily': 'Arial', 'padding': '20px'})

    return layout


In [15]:
# Define the app layout.
# create_layout() is called once here and its return value is assigned, so all
# figures are built a single time when this cell runs.
app.layout = create_layout()


Creating visualizations and layout...


In [16]:
# Main function to run the app
if __name__ == '__main__':
    print("Starting dashboard server...")
    # debug=True can enable Werkzeug's reloader, which re-runs this code in a
    # child process with WERKZEUG_RUN_MAIN set. Scheduling the browser only
    # when that variable is absent avoids opening a second tab from the child.
    if not os.environ.get("WERKZEUG_RUN_MAIN"):
        # Open browser after a short delay to give the server time to start
        Timer(1.5, open_browser).start()
    # Printed before the server starts: the run call can block until the
    # server is stopped, which would delay this hint until it is useless.
    print("Dashboard is running. If browser didn't open automatically, navigate to http://127.0.0.1:8050/")
    # Prefer app.run (run_server is deprecated and no longer available in
    # recent Dash releases); fall back to run_server on older versions that
    # do not provide app.run. debug=True surfaces errors in the browser.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True, port=8050)

Starting dashboard server...


Dashboard is running. If browser didn't open automatically, navigate to http://127.0.0.1:8050/
