In [1]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pytz
from datetime import datetime
import os
import re

In [4]:
# Show the current working directory; the CSV file names used in this
# notebook are relative paths, so they are resolved against it.
import os
os.getcwd()

'd:\\Internship\\Microsoft VS Code'

In [2]:
# Load the app listing and the user reviews into module-level dataframes.
# NOTE(review): the generate_dashboard() functions visible in this notebook
# re-read 'Play Store Data.csv' themselves instead of using these globals.
apps_df=pd.read_csv('Play Store Data.csv')
reviews_df=pd.read_csv('User Reviews.csv')

In [3]:
def is_time_to_show_graph():
    """Return True while the Asia/Kolkata clock is in the 13:00-14:00 window.

    The lower bound is inclusive and the upper bound exclusive.
    """
    current = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Both bounds are derived from "now" so they fall on today's date in IST.
    window_open = current.replace(hour=13, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=14, minute=0, second=0, microsecond=0)
    return window_open <= current < window_close

def generate_dashboard():
    """Generate the dual-axis installs/revenue chart and save it as HTML.

    Writes ``app_dashboard.html`` in the working directory. When
    ``is_time_to_show_graph()`` is false, or when no rows survive the
    filters, the file contains a short notice instead of a chart. If the
    CSV is missing, unreadable or empty, an error is printed and no file
    is written.

    Returns:
        None. Results are reported through the HTML file and ``print``.
    """
    output_path = "app_dashboard.html"

    def write_page(html):
        # Write UTF-8 explicitly so the result does not depend on the
        # platform's default text encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html)

    # Outside the display window only a notice page is produced.
    if not is_time_to_show_graph():
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: red;">The app dashboard is only available between 1 PM and 2 PM IST.</h1>
            <p>Please check back during the specified time window.</p>
        </div>
        """
        write_page(html_content)
        print("Dashboard not generated. It is not between 1 PM and 2 PM IST.")
        return

    # The CSV is looked up relative to the working directory.
    file_path = "Play Store Data.csv"
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        apps_df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    if apps_df.empty:
        print("Error: The dataframe is empty after loading the CSV file.")
        return

    # --- Data cleaning and preprocessing ---

    # Drop rows with missing values in the columns used below.
    apps_df.dropna(subset=['Installs', 'Price', 'Size', 'Content Rating', 'Android Ver', 'Category', 'App'], inplace=True)

    # Strip '+' and ',' from 'Installs' and '$' from 'Price', then parse
    # leniently: values that still are not numbers become NaN and are
    # dropped instead of aborting the whole run.
    installs_text = (
        apps_df['Installs'].astype(str)
        .str.replace('+', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    apps_df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
    price_text = apps_df['Price'].astype(str).str.replace('$', '', regex=False)
    apps_df['Price'] = pd.to_numeric(price_text, errors='coerce')
    apps_df.dropna(subset=['Installs', 'Price'], inplace=True)
    # int64 explicitly: plain ``int`` can map to a 32-bit integer on some
    # platforms, and install counts are summed per category further down.
    apps_df['Installs'] = apps_df['Installs'].astype('int64')
    apps_df['Price'] = apps_df['Price'].astype(float)

    def convert_size(size):
        """Convert a size string containing 'M' or 'k' to megabytes, else None."""
        text = str(size)
        try:
            if 'M' in text:
                return float(text.replace('M', ''))
            if 'k' in text:
                return float(text.replace('k', '')) / 1024
        except ValueError:
            # Contains the unit letter but is not a plain number.
            return None
        return None

    apps_df['Size'] = apps_df['Size'].apply(convert_size)
    apps_df.dropna(subset=['Size'], inplace=True)

    def convert_android_ver(ver):
        """Return the leading 'major.minor' number of a version string, else None."""
        # Only the first one or two numeric components are used, so values
        # with a patch component or a trailing suffix are parsed rather
        # than failing float() and being dropped.
        match = re.match(r'\s*(\d+(?:\.\d+)?)', str(ver))
        return float(match.group(1)) if match else None

    apps_df['Android Ver'] = apps_df['Android Ver'].apply(convert_android_ver)
    apps_df.dropna(subset=['Android Ver'], inplace=True)

    # Revenue is estimated as price times install count.
    apps_df['Revenue'] = apps_df['Price'] * apps_df['Installs']

    # --- Apply filters ---
    filtered_df = apps_df[
        (apps_df['Installs'] > 10000) &
        (apps_df['Revenue'] > 10000) &
        (apps_df['Android Ver'] > 4.0) &
        (apps_df['Size'] > 15) &
        (apps_df['Content Rating'] == 'Everyone') &
        (apps_df['App'].str.len() <= 30)
    ].copy()

    if filtered_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">No data found after applying all filters.</h1>
            <p>The applied filters were very strict. Try relaxing them to see some results.</p>
        </div>
        """
        write_page(html_content)
        print("No data found after applying all filters. HTML file generated with a message.")
        return

    # Keep only the three categories with the largest total installs.
    top_categories = filtered_df.groupby('Category')['Installs'].sum().nlargest(3).index
    top_3_df = filtered_df[filtered_df['Category'].isin(top_categories)]

    # Average installs and revenue per (category, app type) pair.
    grouped_df = top_3_df.groupby(['Category', 'Type']).agg(
        Average_Installs=('Installs', 'mean'),
        Average_Revenue=('Revenue', 'mean')
    ).reset_index()

    # Figure with a secondary y-axis: bars on the primary, lines on the secondary.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    app_types = grouped_df['Type'].unique()

    # All bar traces are added before the line traces, which fixes the
    # order of the legend entries.
    for app_type in app_types:
        df_subset = grouped_df[grouped_df['Type'] == app_type]
        fig.add_trace(
            go.Bar(
                x=df_subset['Category'],
                y=df_subset['Average_Installs'],
                name=f'Average Installs ({app_type})',
                marker_color='blue' if app_type == 'Free' else 'orange',
            ),
            secondary_y=False,
        )

    for app_type in app_types:
        df_subset = grouped_df[grouped_df['Type'] == app_type]
        fig.add_trace(
            go.Scatter(
                x=df_subset['Category'],
                y=df_subset['Average_Revenue'],
                name=f'Average Revenue ({app_type})',
                mode='lines+markers',
                marker=dict(size=10, symbol='circle'),
                line=dict(width=4),
            ),
            secondary_y=True,
        )

    fig.update_layout(
        title_text='Average Installs and Revenue for Top 3 App Categories (Free vs. Paid)',
        xaxis_title='App Category',
        legend_title='Metric & Type',
        template='plotly_white',
        barmode='group'
    )

    fig.update_yaxes(title_text="Average Installs", secondary_y=False, rangemode='tozero')
    fig.update_yaxes(title_text="Average Revenue ($)", secondary_y=True, rangemode='tozero')

    # Render a standalone page that loads plotly.js from the CDN.
    write_page(fig.to_html(full_html=True, include_plotlyjs='cdn'))

    print("Dashboard generated successfully as app_dashboard.html")

# Build the dual-axis dashboard when this code is run directly rather than imported.
if __name__ == "__main__":
    generate_dashboard()


Dashboard not generated. It is not between 1 PM and 2 PM IST.


In [4]:
def is_time_to_show_graph():
    """Return True while the Asia/Kolkata clock is in the 15:00-17:00 window.

    The lower bound is inclusive and the upper bound exclusive.
    """
    current = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Both bounds are derived from "now" so they fall on today's date in IST.
    window_open = current.replace(hour=15, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=17, minute=0, second=0, microsecond=0)
    return window_open <= current < window_close

def generate_dashboard():
    """Generate the rating/reviews grouped bar chart and save it as HTML.

    Writes ``app_ratings_reviews_chart.html`` in the working directory.
    When ``is_time_to_show_graph()`` is false, when the CSV loads as an
    empty frame, or when no rows survive the filters, the file contains a
    short notice instead of a chart. If the CSV is missing or unreadable,
    an error is printed and no file is written.

    Returns:
        None. Results are reported through the HTML file and ``print``.
    """
    output_path = "app_ratings_reviews_chart.html"

    def write_page(html):
        # Write UTF-8 explicitly so the result does not depend on the
        # platform's default text encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html)

    # Outside the display window only a notice page is produced.
    if not is_time_to_show_graph():
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: red;">The app dashboard is only available between 3 PM and 5 PM IST.</h1>
            <p>Please check back during the specified time window.</p>
        </div>
        """
        write_page(html_content)
        print("Dashboard not generated. It is not between 3 PM and 5 PM IST.")
        return

    # The CSV is looked up relative to the working directory.
    file_path = "Play Store Data.csv"
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        apps_df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    if apps_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">The uploaded data is empty or invalid.</h1>
            <p>Please provide a valid Play Store data CSV file.</p>
        </div>
        """
        write_page(html_content)
        print("Error: The dataframe is empty after loading the CSV file.")
        return

    # --- Data cleaning and preprocessing ---

    # Drop rows with missing values in the columns used below.
    apps_df.dropna(subset=['Rating', 'Reviews', 'Size', 'Last Updated', 'Category', 'Installs'], inplace=True)

    # Non-numeric review counts become NaN and are dropped.
    apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
    apps_df.dropna(subset=['Reviews'], inplace=True)

    # Strip '+' and ',' from 'Installs', then parse leniently: values that
    # still are not numbers are dropped instead of aborting the whole run.
    installs_text = (
        apps_df['Installs'].astype(str)
        .str.replace('+', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    apps_df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
    apps_df.dropna(subset=['Installs'], inplace=True)
    # int64 explicitly: plain ``int`` can map to a 32-bit integer on some
    # platforms, and install counts are summed per category further down.
    apps_df['Installs'] = apps_df['Installs'].astype('int64')

    def convert_size(size):
        """Convert a size string containing 'M' or 'k' to megabytes, else None."""
        text = str(size)
        try:
            if 'M' in text:
                return float(text.replace('M', ''))
            if 'k' in text:
                return float(text.replace('k', '')) / 1024
        except ValueError:
            # Contains the unit letter but is not a plain number.
            return None
        return None

    apps_df['Size'] = apps_df['Size'].apply(convert_size)
    apps_df.dropna(subset=['Size'], inplace=True)

    # Unparseable dates become NaT and are dropped rather than raising.
    apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
    apps_df.dropna(subset=['Last Updated'], inplace=True)

    # --- Apply filters ---
    # Rating of at least 4.0, size under 10 MB, last updated in January (any year).
    filtered_df = apps_df[
        (apps_df['Rating'] >= 4.0) &
        (apps_df['Size'] < 10) &
        (apps_df['Last Updated'].dt.month == 1)
    ].copy()

    if filtered_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">No data found after applying all filters.</h1>
            <p>The applied filters were very strict. Try relaxing them to see some results.</p>
        </div>
        """
        write_page(html_content)
        print("No data found after applying all filters. HTML file generated with a message.")
        return

    # Keep only the ten categories with the largest total installs.
    top_categories = filtered_df.groupby('Category')['Installs'].sum().nlargest(10).index
    top_10_df = filtered_df[filtered_df['Category'].isin(top_categories)]

    # Average rating and total review count per category.
    grouped_df = top_10_df.groupby('Category').agg(
        Average_Rating=('Rating', 'mean'),
        Total_Reviews=('Reviews', 'sum')
    ).reset_index()

    # NOTE(review): both bar series share a single y-axis, so the rating
    # bars may be visually dwarfed by the review totals - confirm intended.
    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=grouped_df['Category'],
        y=grouped_df['Average_Rating'],
        name='Average Rating',
        marker_color='#1f77b4'  # Blue color
    ))

    fig.add_trace(go.Bar(
        x=grouped_df['Category'],
        y=grouped_df['Total_Reviews'],
        name='Total Reviews',
        marker_color='#ff7f0e'  # Orange color
    ))

    # Grouped bars with a horizontal legend placed above the plot area.
    fig.update_layout(
        title_text='Average Rating and Total Reviews for Top 10 App Categories',
        xaxis_title='App Category',
        yaxis_title='Value',
        barmode='group',
        template='plotly_white',
        legend_title_text='Metric',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Render a standalone page that loads plotly.js from the CDN.
    write_page(fig.to_html(full_html=True, include_plotlyjs='cdn'))

    print("Dashboard generated successfully as app_ratings_reviews_chart.html")

# Build the ratings/reviews chart when this code is run directly rather than imported.
if __name__ == "__main__":
    generate_dashboard()


Dashboard not generated. It is not between 3 PM and 5 PM IST.


In [7]:
def is_time_to_show_graph():
    """Return True while the Asia/Kolkata clock is in the 16:00-18:00 window.

    The lower bound is inclusive and the upper bound exclusive.
    """
    current = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Both bounds are derived from "now" so they fall on today's date in IST.
    window_open = current.replace(hour=16, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=18, minute=0, second=0, microsecond=0)
    return window_open <= current < window_close

def generate_dashboard():
    """Generate the cumulative-installs stacked area chart and save it as HTML.

    Writes ``app_installs_time_series.html`` in the working directory.
    When ``is_time_to_show_graph()`` is false, when the CSV loads as an
    empty frame, or when no rows survive the filters, the file contains a
    short notice instead of a chart. If the CSV is missing or unreadable,
    an error is printed and no file is written.

    Returns:
        None. Results are reported through the HTML file and ``print``.
    """
    output_path = "app_installs_time_series.html"

    def write_page(html):
        # The legend labels below are non-ASCII; write UTF-8 explicitly so
        # the result does not depend on the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html)

    # Outside the display window only a notice page is produced.
    if not is_time_to_show_graph():
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: red;">The app visualization is only available between 4 PM and 6 PM IST.</h1>
            <p>Please check back during the specified time window.</p>
        </div>
        """
        write_page(html_content)
        print("Dashboard not generated. It is not between 4 PM and 6 PM IST.")
        return

    # The CSV is looked up relative to the working directory.
    file_path = "Play Store Data.csv"
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        apps_df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    if apps_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">The uploaded data is empty or invalid.</h1>
            <p>Please provide a valid Play Store data CSV file.</p>
        </div>
        """
        write_page(html_content)
        print("Error: The dataframe is empty after loading the CSV file.")
        return

    # --- Data Cleaning and Preprocessing ---

    # Drop rows with missing values in the columns used below.
    apps_df.dropna(subset=['Rating', 'App', 'Category', 'Reviews', 'Size', 'Last Updated', 'Installs'], inplace=True)

    # Non-numeric review counts become NaN and are dropped.
    apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
    apps_df.dropna(subset=['Reviews'], inplace=True)

    # Strip '+' and ',' from 'Installs', then parse leniently: values that
    # still are not numbers are dropped instead of aborting the whole run.
    installs_text = (
        apps_df['Installs'].astype(str)
        .str.replace('+', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    apps_df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
    apps_df.dropna(subset=['Installs'], inplace=True)
    # int64 explicitly: plain ``int`` can map to a 32-bit integer on some
    # platforms, and install counts are summed cumulatively further down.
    apps_df['Installs'] = apps_df['Installs'].astype('int64')

    def convert_size(size):
        """Convert a size string containing 'M' or 'k' to megabytes, else None."""
        text = str(size)
        try:
            if 'M' in text:
                return float(text.replace('M', ''))
            if 'k' in text:
                return float(text.replace('k', '')) / 1024
        except ValueError:
            # Contains the unit letter but is not a plain number.
            return None
        return None

    apps_df['Size'] = apps_df['Size'].apply(convert_size)
    apps_df.dropna(subset=['Size'], inplace=True)

    # Unparseable dates become NaT and are dropped rather than raising.
    apps_df['Last Updated'] = pd.to_datetime(apps_df['Last Updated'], errors='coerce')
    apps_df.dropna(subset=['Last Updated'], inplace=True)

    # --- Apply Filters ---
    # Rating >= 4.2, no digit in the app name, one of three categories,
    # more than 1000 reviews and a size between 20 and 80 MB inclusive.
    filtered_df = apps_df[
        (apps_df['Rating'] >= 4.2) &
        (apps_df['App'].apply(lambda x: not bool(re.search(r'\d', x)))) &
        (apps_df['Category'].isin(['TRAVEL_AND_LOCAL', 'PRODUCTIVITY', 'PHOTOGRAPHY'])) &
        (apps_df['Reviews'] > 1000) &
        (apps_df['Size'] >= 20) &
        (apps_df['Size'] <= 80)
    ].copy()

    if filtered_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">No data found after applying all filters.</h1>
            <p>The applied filters were very strict. Try relaxing them to see some results.</p>
        </div>
        """
        write_page(html_content)
        print("No data found after applying all filters. HTML file generated with a message.")
        return

    # Total installs per category and month of last update.
    filtered_df['Month'] = filtered_df['Last Updated'].dt.to_period('M')
    grouped_df = filtered_df.groupby(['Category', 'Month']).agg(
        Total_Installs=('Installs', 'sum')
    ).reset_index()

    # Chronological order within each category is required for both the
    # running total and the month-over-month change.
    grouped_df.sort_values(by=['Category', 'Month'], inplace=True)
    grouped_df['Cumulative_Installs'] = grouped_df.groupby('Category')['Total_Installs'].cumsum()
    grouped_df['Monthly_Change_Pct'] = grouped_df.groupby('Category')['Total_Installs'].pct_change()

    # --- Legend Translation and Color Mapping ---
    category_map = {
        'TRAVEL_AND_LOCAL': 'Voyages et locaux',
        'PRODUCTIVITY': 'Productividad',
        'PHOTOGRAPHY': '写真'
    }

    # Semi-transparent fills for the stacked areas.
    base_colors = {
        'TRAVEL_AND_LOCAL': 'rgba(31, 119, 180, 0.7)',
        'PRODUCTIVITY': 'rgba(255, 127, 14, 0.7)',
        'PHOTOGRAPHY': 'rgba(44, 160, 44, 0.7)'
    }

    # Darker shades of the same hues for the growth highlights.
    highlight_colors = {
        'TRAVEL_AND_LOCAL': 'rgb(21, 80, 122)',
        'PRODUCTIVITY': 'rgb(194, 97, 8)',
        'PHOTOGRAPHY': 'rgb(30, 110, 30)'
    }

    # --- Create Plotly Figure ---
    fig = go.Figure()

    for category in grouped_df['Category'].unique():
        category_df = grouped_df[grouped_df['Category'] == category]
        translated_name = category_map.get(category, category)

        # Base stacked area trace for this category.
        fig.add_trace(go.Scatter(
            x=category_df['Month'].astype(str),
            y=category_df['Cumulative_Installs'],
            name=translated_name,
            mode='lines',
            stackgroup='one',
            line=dict(width=0),
            fill='tozeroy',
            fillcolor=base_colors[category],
            hoverinfo='x+y+name'
        ))

        # Overlay the months whose installs grew by more than 25% over the
        # previous month in the same category.
        highlight_df = category_df[category_df['Monthly_Change_Pct'] > 0.25]
        if not highlight_df.empty:
            fig.add_trace(go.Scatter(
                x=highlight_df['Month'].astype(str),
                y=highlight_df['Cumulative_Installs'],
                mode='lines+markers',
                showlegend=False,
                line=dict(color=highlight_colors[category], width=4),
                marker=dict(symbol='circle', size=8, color=highlight_colors[category]),
                hoverinfo='x+y',
                customdata=highlight_df[['Monthly_Change_Pct']],
                hovertemplate=
                    "Month: %{x}<br>" +
                    "Cumulative Installs: %{y:,}<br>" +
                    "Monthly Growth: %{customdata[0]:.2%}<br>" +
                    "<extra></extra>"
            ))

    # --- Update Layout ---
    fig.update_layout(
        title_text='Cumulative App Installs Over Time by Category',
        xaxis_title='Month',
        yaxis_title='Cumulative Installs',
        template='plotly_white',
        hovermode='x unified'
    )

    # Render a standalone page that loads plotly.js from the CDN.
    write_page(fig.to_html(full_html=True, include_plotlyjs='cdn'))

    print("Dashboard generated successfully as app_installs_time_series.html")

# Build the installs time-series chart when this code is run directly rather than imported.
if __name__ == "__main__":
    generate_dashboard()


Dashboard generated successfully as app_installs_time_series.html


In [6]:
def is_time_to_show_graph():
    """Return True while the Asia/Kolkata clock is in the 17:00-19:00 window.

    The lower bound is inclusive and the upper bound exclusive.
    """
    current = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Both bounds are derived from "now" so they fall on today's date in IST.
    window_open = current.replace(hour=17, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=19, minute=0, second=0, microsecond=0)
    return window_open <= current < window_close

def generate_dashboard():
    """Generate the size-vs-rating bubble chart and save it as HTML.

    Writes ``app_metrics_bubble_chart.html`` in the working directory.
    When ``is_time_to_show_graph()`` is false, when the CSV loads as an
    empty frame, or when no rows survive the filters, the file contains a
    short notice instead of a chart. If the CSV is missing or unreadable,
    an error is printed and no file is written.

    Returns:
        None. Results are reported through the HTML file and ``print``.
    """
    output_path = "app_metrics_bubble_chart.html"

    def write_page(html):
        # Some legend labels below are non-ASCII; write UTF-8 explicitly so
        # the result does not depend on the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html)

    # Outside the display window only a notice page is produced.
    if not is_time_to_show_graph():
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: red;">The app visualization is only available between 5 PM and 7 PM IST.</h1>
            <p>Please check back during the specified time window.</p>
        </div>
        """
        write_page(html_content)
        print("Dashboard not generated. It is not between 5 PM and 7 PM IST.")
        return

    # The CSV is looked up relative to the working directory.
    file_path = "Play Store Data.csv"
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        apps_df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    if apps_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">The uploaded data is empty or invalid.</h1>
            <p>Please provide a valid Play Store data CSV file.</p>
        </div>
        """
        write_page(html_content)
        print("Error: The dataframe is empty after loading the CSV file.")
        return

    # --- Data Cleaning and Preprocessing ---

    # Drop rows with missing values in the columns used below.
    apps_df.dropna(subset=['Rating', 'Reviews', 'Size', 'Installs', 'Category', 'App'], inplace=True)

    # Non-numeric ratings/reviews become NaN; such rows fail the numeric
    # comparisons in the filter below and are excluded there.
    apps_df['Rating'] = pd.to_numeric(apps_df['Rating'], errors='coerce')
    apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')

    # A malformed CSV row carries the text 'Free' in 'Installs', so parse
    # leniently and drop whatever is not a number.
    installs_text = (
        apps_df['Installs'].astype(str)
        .str.replace('+', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    apps_df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
    apps_df.dropna(subset=['Installs'], inplace=True)
    # int64 explicitly: plain ``int`` can map to a 32-bit integer on some platforms.
    apps_df['Installs'] = apps_df['Installs'].astype('int64')

    def convert_size(size):
        """Convert a size string containing 'M' or 'k' to megabytes, else None."""
        text = str(size)
        try:
            if 'M' in text:
                return float(text.replace('M', ''))
            if 'k' in text:
                return float(text.replace('k', '')) / 1024
        except ValueError:
            # Contains the unit letter but is not a plain number.
            return None
        return None

    apps_df['Size'] = apps_df['Size'].apply(convert_size)
    apps_df.dropna(subset=['Size'], inplace=True)

    # --- Apply Filters ---
    selected_categories = [
        'GAME', 'BEAUTY', 'BUSINESS', 'COMICS', 'COMMUNICATION',
        'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENTS'
    ]

    # App names containing the letter 's' (either case) are excluded.
    name_has_s = apps_df['App'].str.contains('s', case=False, na=False)
    filtered_df = apps_df[
        (apps_df['Rating'] > 3.5) &
        (apps_df['Reviews'] > 500) &
        (apps_df['Installs'] > 50000) &
        ~name_has_s &
        (apps_df['Category'].isin(selected_categories))
    ].copy()

    # NOTE: The provided CSV file does not contain a 'Sentiment Subjectivity' column.
    # This filter has been omitted as the data is unavailable.

    if filtered_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">No data found after applying all filters.</h1>
            <p>The applied filters were very strict. Try relaxing them to see some results.</p>
        </div>
        """
        write_page(html_content)
        print("No data found after applying all filters. HTML file generated with a message.")
        return

    # --- Legend Translation and Color Mapping ---
    category_map = {
        'BEAUTY': 'सौंदर्य',        # Hindi
        'BUSINESS': 'வணிகம்',      # Tamil
        'DATING': 'Verabredung'     # German
    }

    # Pre-defined colors for the categories
    category_colors = {
        'GAME': '#ff69b4',           # Pink for Game
        'BEAUTY': '#8a2be2',
        'BUSINESS': '#a52a2a',
        'COMICS': '#008080',
        'COMMUNICATION': '#4169e1',
        'DATING': '#d2691e',
        'ENTERTAINMENT': '#00ff7f',
        'SOCIAL': '#ffc0cb',
        'EVENTS': '#800080'
    }

    # Bubble sizing: sizeref has to be derived from the same (scaled)
    # values that are passed as marker sizes; otherwise every bubble ends
    # up far below sizemin and all markers render at the same size.
    bubble_scale = 10000
    max_bubble_px = 100.
    sizeref = 2. * (filtered_df['Installs'].max() / bubble_scale) / (max_bubble_px ** 2)

    fig = go.Figure()

    # One trace per category so each gets its own legend entry and color.
    for category in filtered_df['Category'].unique():
        category_df = filtered_df[filtered_df['Category'] == category]
        translated_name = category_map.get(category, category)
        color = category_colors.get(category, '#cccccc')  # Default color if not in map

        fig.add_trace(go.Scatter(
            x=category_df['Size'],
            y=category_df['Rating'],
            mode='markers',
            name=translated_name,
            marker=dict(
                color=color,
                size=category_df['Installs'] / bubble_scale,
                sizemode='area',
                sizeref=sizeref,
                sizemin=4,
                opacity=0.6,
                line=dict(width=1, color='DarkSlateGrey')
            ),
            text=category_df['App'],  # Hover text
            hoverinfo='text+x+y'
        ))

    # Horizontal legend placed above the plot area.
    fig.update_layout(
        title_text='App Size vs. Average Rating (Bubble Size = Installs)',
        xaxis_title='App Size (MB)',
        yaxis_title='Average Rating',
        template='plotly_white',
        legend_title_text='Category',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Render a standalone page that loads plotly.js from the CDN.
    write_page(fig.to_html(full_html=True, include_plotlyjs='cdn'))

    print("Dashboard generated successfully as app_metrics_bubble_chart.html")

# Build the bubble chart when this code is run directly rather than imported.
if __name__ == "__main__":
    generate_dashboard()


Dashboard generated successfully as app_metrics_bubble_chart.html


In [7]:
def is_time_to_show_graph():
    """Return True while the Asia/Kolkata clock is in the 18:00-20:00 window.

    The lower bound is inclusive and the upper bound exclusive.
    """
    current = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Both bounds are derived from "now" so they fall on today's date in IST.
    window_open = current.replace(hour=18, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=20, minute=0, second=0, microsecond=0)
    return window_open <= current < window_close

def generate_dashboard():
    """Generate the top-5 categories bar chart and save it as HTML.

    Writes ``app_installs_by_category.html`` in the working directory.
    When ``is_time_to_show_graph()`` is false, when the CSV loads as an
    empty frame, or when no category survives the filter, the file
    contains a short notice instead of a chart. If the CSV is missing or
    unreadable, an error is printed and no file is written.

    Returns:
        None. Results are reported through the HTML file and ``print``.
    """
    output_path = "app_installs_by_category.html"

    def write_page(html):
        # Write UTF-8 explicitly so the result does not depend on the
        # platform's default text encoding.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html)

    # Outside the display window only a notice page is produced.
    if not is_time_to_show_graph():
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: red;">The app visualization is only available between 6 PM and 8 PM IST.</h1>
            <p>Please check back during the specified time window.</p>
        </div>
        """
        write_page(html_content)
        print("Dashboard not generated. It is not between 6 PM and 8 PM IST.")
        return

    # The CSV is looked up relative to the working directory.
    file_path = "Play Store Data.csv"
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        apps_df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return

    if apps_df.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">The uploaded data is empty or invalid.</h1>
            <p>Please provide a valid Play Store data CSV file.</p>
        </div>
        """
        write_page(html_content)
        print("Error: The dataframe is empty after loading the CSV file.")
        return

    # --- Data Cleaning and Preprocessing ---
    apps_df.dropna(subset=['Category', 'Installs'], inplace=True)

    # Strip '+' and ',' from 'Installs', then parse leniently: values that
    # still are not numbers are dropped instead of aborting the whole run.
    installs_text = (
        apps_df['Installs'].astype(str)
        .str.replace('+', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    apps_df['Installs'] = pd.to_numeric(installs_text, errors='coerce')
    apps_df.dropna(subset=['Installs'], inplace=True)
    # int64 explicitly: plain ``int`` can map to a 32-bit integer on some
    # platforms, and install counts are summed per category below.
    apps_df['Installs'] = apps_df['Installs'].astype('int64')

    # --- Apply Filters ---
    # Exclude every category whose name starts with 'A', 'C', 'G', or 'S'.
    # Single-letter prefixes are required for that: matching on whole
    # category names would let other categories with the same initial
    # (e.g. COMICS) through.
    excluded_prefixes = ('A', 'C', 'G', 'S')
    filtered_df = apps_df[~apps_df['Category'].str.startswith(excluded_prefixes, na=False)].copy()

    # Top 5 remaining categories by total installs, largest first.
    category_installs = filtered_df.groupby('Category')['Installs'].sum().nlargest(5).reset_index()

    if category_installs.empty:
        html_content = """
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: blue;">No data found after applying all filters.</h1>
            <p>The applied filters were very strict. Try relaxing them to see some results.</p>
        </div>
        """
        write_page(html_content)
        print("No data found after applying all filters. HTML file generated with a message.")
        return

    # --- Create Plotly Bar Chart ---
    # Bars with more than 1 million total installs get the highlight color.
    colors = [
        '#ff7f0e' if total > 1000000 else '#1f77b4'
        for total in category_installs['Installs']
    ]

    fig = go.Figure(data=[go.Bar(
        x=category_installs['Category'],
        y=category_installs['Installs'],
        marker_color=colors,
        text=category_installs['Installs'],
        texttemplate='%{y:,s}',
        textposition='outside'
    )])

    fig.update_layout(
        title_text='Top 5 App Categories by Total Installs',
        xaxis_title='Category',
        yaxis_title='Total Installs',
        template='plotly_white',
        uniformtext_minsize=8,
        uniformtext_mode='hide'
    )

    # Render a standalone page that loads plotly.js from the CDN.
    write_page(fig.to_html(full_html=True, include_plotlyjs='cdn'))

    print("Dashboard generated successfully as app_installs_by_category.html")

# Build the installs-by-category chart when this code is run directly rather than imported.
if __name__ == "__main__":
    generate_dashboard()


Dashboard generated successfully as app_installs_by_category.html


In [9]:
def is_time_to_show_graph():
    """Return True while the Asia/Kolkata clock is in the 18:00-21:00 window.

    The lower bound is inclusive and the upper bound exclusive.
    """
    current = datetime.now(pytz.timezone('Asia/Kolkata'))
    # Both bounds are derived from "now" so they fall on today's date in IST.
    window_open = current.replace(hour=18, minute=0, second=0, microsecond=0)
    window_close = current.replace(hour=21, minute=0, second=0, microsecond=0)
    return window_open <= current < window_close

def _trend_message_page(color, heading, detail):
    """Return the centered notice markup written in place of the trend chart.

    Args:
        color: CSS color applied to the heading.
        heading: Text placed inside the <h1> element.
        detail: Text placed inside the <p> element under the heading.
    """
    return f"""
        <div style="text-align: center; margin-top: 50px; font-family: Arial, sans-serif;">
            <h1 style="color: {color};">{heading}</h1>
            <p>{detail}</p>
        </div>
        """


def _write_trend_page(html_content, output_path="app_trend_chart.html"):
    """Write *html_content* to *output_path*, always encoded as UTF-8."""
    # The chart legend contains Hindi and Tamil category names, so the write
    # must not depend on the platform's default text encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)


def _prepare_trend_data(apps_df):
    """Clean the raw Play Store rows and apply the trend chart's filters.

    The input frame is not modified; all work happens on a copy.

    Args:
        apps_df: DataFrame holding at least the 'App', 'Category', 'Reviews',
            'Installs' and 'Last Updated' columns.

    Returns:
        DataFrame (possibly empty) of the rows that pass every filter, with
        integer 'Installs' and 'Reviews', datetime 'Last Updated', and an
        added monthly-period 'Month' column.
    """
    df = apps_df.copy()
    df.dropna(subset=['App', 'Category', 'Reviews', 'Installs', 'Last Updated'], inplace=True)

    # Strip '+' and ',' from 'Installs' before numeric conversion; values that
    # still fail to parse are dropped.
    df['Installs'] = df['Installs'].str.replace('+', '', regex=False).str.replace(',', '', regex=False)
    df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')
    df.dropna(subset=['Installs'], inplace=True)
    df['Installs'] = df['Installs'].astype(int)

    df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')
    df.dropna(subset=['Reviews'], inplace=True)
    df['Reviews'] = df['Reviews'].astype(int)

    # Unparseable dates are dropped like the other malformed fields instead of
    # aborting the whole run with an exception.
    df['Last Updated'] = pd.to_datetime(df['Last Updated'], errors='coerce')
    df.dropna(subset=['Last Updated'], inplace=True)
    df['Month'] = df['Last Updated'].dt.to_period('M')

    keep = (
        # App name must not start with 'x', 'y' or 'z' (any case).
        ~df['App'].str.lower().str.startswith(('x', 'y', 'z'), na=False)
        # Category must start with 'E', 'C' or 'B'.
        & df['Category'].str.startswith(('E', 'C', 'B'), na=False)
        # More than 500 reviews.
        & (df['Reviews'] > 500)
        # App name must not contain the letter 's' (any case).
        & ~df['App'].str.contains('s', case=False, na=False)
    )
    return df[keep].copy()


def generate_dashboard():
    """Generate the installs-over-time line chart and save it as an HTML page.

    Steps:
      1. Outside the window accepted by ``is_time_to_show_graph`` a notice
         page is written instead of the chart.
      2. "Play Store Data.csv" is loaded, cleaned and filtered.
      3. Installs are summed per category and month; each category becomes
         one line, and every step whose month-over-month growth exceeds 20%
         is shaded under the curve.

    The page is always written to "app_trend_chart.html" (UTF-8). When the
    CSV is missing or unreadable only an error is printed and no file is
    written.

    Returns:
        None
    """
    # Check if it's the right time to generate the dashboard
    if not is_time_to_show_graph():
        _write_trend_page(_trend_message_page(
            "red",
            "The app visualization is only available between 6 PM and 9 PM IST.",
            "Please check back during the specified time window.",
        ))
        print("Dashboard not generated. It is not between 6 PM and 9 PM IST.")
        return

    file_path = "Play Store Data.csv"
    if not os.path.exists(file_path):
        print(f"Error: The file '{file_path}' was not found.")
        return

    try:
        apps_df = pd.read_csv(file_path)
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported, not raised.
        print(f"Error reading the CSV file: {e}")
        return

    if apps_df.empty:
        _write_trend_page(_trend_message_page(
            "blue",
            "The uploaded data is empty or invalid.",
            "Please provide a valid Play Store data CSV file.",
        ))
        print("Error: The dataframe is empty after loading the CSV file.")
        return

    filtered_df = _prepare_trend_data(apps_df)

    if filtered_df.empty:
        _write_trend_page(_trend_message_page(
            "blue",
            "No data found after applying all filters.",
            "The applied filters were very strict. Try relaxing them to see some results.",
        ))
        print("No data found after applying all filters. HTML file generated with a message.")
        return

    # Translate categories for the legend; unmapped categories keep their name.
    category_map = {
        'BEAUTY': 'सौंदर्य',        # Hindi
        'BUSINESS': 'வணிகம்'       # Tamil
    }

    # Group by category and month to get total installs
    monthly_installs = filtered_df.groupby(['Category', 'Month'])['Installs'].sum().reset_index()

    fig = go.Figure()

    # sort=False keeps categories in order of first appearance.
    for category, category_df in monthly_installs.groupby('Category', sort=False):
        # Fresh 0..n-1 index so neighbouring rows can be addressed as i-1 / i.
        category_df = category_df.reset_index(drop=True)
        category_df['Month'] = category_df['Month'].astype(str)

        # Growth relative to the previous row, in percent.
        # NOTE(review): the previous row is the previous month that has data
        # for this category; gaps between months are not filled in.
        previous = category_df['Installs'].shift(1)
        category_df['growth_rate'] = (category_df['Installs'] - previous) / previous * 100

        # Add the main line chart trace
        fig.add_trace(go.Scatter(
            x=category_df['Month'],
            y=category_df['Installs'],
            mode='lines',
            name=category_map.get(category, category),
            line=dict(width=2)
        ))

        # Shade the area under the curve for every step with > 20% growth.
        # Row 0 has no predecessor (NaN growth), so it is never selected.
        for i in category_df.index[category_df['growth_rate'] > 20]:
            fig.add_trace(go.Scatter(
                x=[category_df.loc[i - 1, 'Month'], category_df.loc[i, 'Month']],
                y=[category_df.loc[i - 1, 'Installs'], category_df.loc[i, 'Installs']],
                mode='lines',
                line=dict(width=0),
                # 'tozeroy' fills down to the x-axis; 'tonexty' would fill
                # towards whichever trace happened to be added just before.
                fill='tozeroy',
                fillcolor='rgba(0, 255, 0, 0.2)',  # Shaded area color
                showlegend=False
            ))

    # NOTE(review): the title says "Cumulative", but the plotted values are
    # per-month sums (no running total is computed) — confirm the intent.
    fig.update_layout(
        title_text='Cumulative App Installs Over Time by Category',
        xaxis_title='Date',
        yaxis_title='Total Installs',
        template='plotly_white',
        hovermode='x unified',
        legend_title_text='App Category'
    )

    # Generate the HTML content and save it to a file
    _write_trend_page(fig.to_html(full_html=True, include_plotlyjs='cdn'))

    print("Dashboard generated successfully as app_trend_chart.html")

# Entry point for this cell: build the installs trend chart page when the
# code is executed directly rather than imported as a module.
if __name__ == "__main__":
    generate_dashboard()


Dashboard generated successfully as app_trend_chart.html
