<a href="https://colab.research.google.com/github/chela-lavin/My-list/blob/main/Jupyter_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset from 'data.csv' (a relative path, so the file is looked up
# in the current working directory).
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError:
    print("Error: 'data.csv' not found. Please make sure the file is in the same directory or provide the correct path.")
    # Fall back to a small placeholder frame so the rest of the script can
    # still run; everything printed afterwards then describes this placeholder
    # rather than real data.
    df = pd.DataFrame(
        {
            'A': [1, 2, 3],
            'B': [4, 5, 6],
            'C': [7, 8, 9],
        }
    )

# Quick overview of the DataFrame: leading rows, structural summary, and
# descriptive statistics.

# Preview of the leading rows (DataFrame.head() defaults to 5 rows).
print("First 5 rows of the data:", df.head(), sep="\n")

# DataFrame.info() prints its report (dtypes, non-null counts) itself, so it
# is called directly rather than wrapped in print().
print("\nData Summary:")
df.info()

# Descriptive statistics as produced by describe() with default arguments.
print("\nDescriptive Statistics:", df.describe(), sep="\n")

# --- Basic Data Analysis ---

# 1. Pull out the first column by position and show its first 5 values.
try:
    first_column = df.iloc[:, 0]
except IndexError:
    # Position 0 is out of bounds only when the DataFrame has no columns.
    print("\nError: No columns found in the DataFrame.")
else:
    print(f"\nFirst column (first 5 values):\n{first_column.head()}")

# 2. Select the first two columns by position and show the first 5 rows.
# The column count is checked before indexing, so the positional lookup can
# never go out of bounds and needs no exception handler.
if len(df.columns) >= 2:
    two_columns = df.iloc[:, [0, 1]]
    print(f"\nFirst two columns (first 5 rows):\n{two_columns.head()}")
elif len(df.columns) == 1:
    print("\nError: DataFrame has only one column.  Cannot select two columns.")
else:
    # Zero columns: the "only one column" message would be wrong here.
    print("\nError: Not enough columns in DataFrame to perform this selection.")

# 3. Keep only the rows whose 'numeric_column' value is greater than a threshold.
# Replace 'numeric_column' with a relevant column name from your data.
# A missing column raises KeyError on lookup, so one handler covers the
# "column not found" case; no separate membership check is needed.
try:
    filtered_df = df[df['numeric_column'] > 5]  # Example threshold: 5
except KeyError:
    print("\nError: 'numeric_column' not found. Skipping filtering.")
else:
    print("\nFiltered data (where 'numeric_column' > 5):")
    print(filtered_df.head())

# 4. Group by 'categorical_column' and compute the mean of 'numeric_column'
# within each group.
# A missing column surfaces as KeyError, either from the group key or from the
# column selection on the grouped object, so a single handler reports both.
try:
    grouped_data = df.groupby('categorical_column')['numeric_column'].mean()
except KeyError:
    print("\nError: 'categorical_column' or 'numeric_column' not found. Skipping grouping.")
else:
    print("\nGrouped data (mean of 'numeric_column' by 'categorical_column'):")
    print(grouped_data)

# 5. Count how often each distinct value occurs in 'another_column'.
# The column lookup itself raises KeyError when the column is absent, so the
# handler below is the only "not found" path.
try:
    unique_value_counts = df['another_column'].value_counts()
except KeyError:
    print("\nError: 'another_column' not found. Skipping unique value counts.")
else:
    print("\nUnique value counts for 'another_column':")
    print(unique_value_counts)

# --- Data Visualization ---
# 1. Histogram of a numerical column.
# Only the column lookup is guarded: a missing column raises KeyError there,
# and the plotting calls run solely on the success path.
try:
    histogram_values = df['numeric_column'].dropna()  # Drop NaN values before plotting
except KeyError:
    print("\nError: 'numeric_column' not found. Skipping histogram.")
else:
    plt.hist(histogram_values)
    plt.title('Histogram of Numerical Column')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.show()

# 2. Scatter plot of two numerical columns.
# Both columns are looked up first; if either is absent the lookup raises
# KeyError and the plot is skipped.
try:
    x_values = df['numeric_column_1']
    y_values = df['numeric_column_2']
except KeyError:
    print("\nError: 'numeric_column_1' or 'numeric_column_2' not found. Skipping scatter plot.")
else:
    plt.scatter(x_values, y_values)
    plt.title('Scatter Plot')
    plt.xlabel('Numerical Column 1')
    plt.ylabel('Numerical Column 2')
    plt.show()

# 3. Bar chart of how often each category occurs in 'categorical_column'.
# Only the column lookup is guarded: a missing column raises KeyError there,
# and the plotting calls run solely on the success path.
try:
    categorical_counts = df['categorical_column'].value_counts()
except KeyError:
    print("\nError: 'categorical_column' not found. Skipping bar chart.")
else:
    plt.bar(categorical_counts.index, categorical_counts.values)
    plt.title('Bar Chart of Categorical Column')
    plt.xlabel('Category')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
    plt.tight_layout()  # Adjust layout so the rotated labels are not clipped
    plt.show()

AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)