In [3]:


import nbformat as nbf
from nbformat.v4 import new_notebook, new_markdown_cell, new_code_cell
import os, json

nb = new_notebook()
cells = []

# Title cell
title_md = """# Data Toolkit Assignment

cells.append(new_markdown_cell(title_md))



theory_qas = [
("What is NumPy, and why is it widely used in Python?",
"NumPy (Numerical Python) is a library providing efficient multi-dimensional arrays (ndarray) and numerical operations over them. It's widely used because it is fast (C-backed), supports vectorized operations, broadcasting, optimized linear algebra, and integrates well with scientific and machine-learning libraries."),
("How does broadcasting work in NumPy?",
"Broadcasting allows NumPy to perform arithmetic operations on arrays of different shapes by 'stretching' the smaller array along the missing dimensions so shapes become compatible. Rules: align trailing dimensions, dimensions must be equal or one of them is 1, or the dimension is absent."),
("What is a Pandas DataFrame?",
"A Pandas DataFrame is a 2D, size-mutable, tabular data structure with labeled axes (rows and columns). It stores heterogeneous data and provides rich functionality for reading, writing, cleaning, grouping, and analyzing data."),
("Explain the use of the groupby() method in Pandas",
"`groupby()` splits the DataFrame into groups based on column(s), then allows aggregation, transformation, or filtration on each group (e.g., `df.groupby('col').mean()` computes mean per group)."),
("Why is Seaborn preferred for statistical visualizations?",
"Seaborn is built on Matplotlib and provides a high-level API for attractive statistical plots with sensible defaults, built-in themes, and functions for visualizing distributions, categorical relationships, regression lines, and matrix plots (like heatmaps)."),
("What are the differences between NumPy arrays and Python lists?",
"NumPy arrays have fixed type (homogeneous), provide vectorized operations, are stored in contiguous memory (fast and memory-efficient), and support multi-dimensionality. Python lists are heterogeneous, stored as pointers to objects, slower for numeric computations, and lack vectorized ops."),
("What is a heatmap, and when should it be used?",
"A heatmap is a matrix-like plot that uses color to show values. Use it to visualize correlation matrices, confusion matrices, or any 2D table where color intensity helps identify patterns."),
("What does the term “vectorized operation” mean in NumPy?",
"Vectorized operations operate element-wise on entire arrays without explicit Python loops, delegating work to optimized C loops — this makes computations much faster and code shorter."),
("How does Matplotlib differ from Plotly?",
"Matplotlib is a static plotting library (although it supports interactive backends) focused on publication-quality figures; Plotly creates interactive, web-ready plots with zooming, tooltips, and easy sharing. Plotly often produces larger outputs but is more interactive."),
("What is the significance of hierarchical indexing in Pandas?",
"Hierarchical (MultiIndex) indexing allows multiple index levels on rows and/or columns enabling powerful reshaping, grouping, and slicing operations for higher-dimensional datasets stored in 2D structures."),
("What is the role of Seaborn’s pairplot() function?",
"`pairplot()` plots pairwise relationships in a dataset (scatterplots for pairs and histograms/ KDEs on the diagonal) — useful for quick exploratory data analysis to spot correlations and distributions."),
("What is the purpose of the describe() function in Pandas?",
"`describe()` provides summary statistics for numeric columns (count, mean, std, min, 25%, 50%, 75%, max). It can also summarize object columns when `include='all'` is used."),
("Why is handling missing data important in Pandas?",
"Missing data can bias analyses, cause errors in models, and produce incorrect aggregates. Pandas provides tools (`isna()`, `dropna()`, `fillna()`, interpolation) to detect and handle missing values appropriately."),
("What are the benefits of using Plotly for data visualization?",
"Plotly offers interactive plots with tooltips, zoom/pan, exporting, dashboards, and easy web embedding. It supports many chart types and interactivity without extra JavaScript coding."),
("How does NumPy handle multidimensional arrays?",
"NumPy uses the ndarray object with a `shape` tuple describing each axis length and a contiguous memory layout (row-major by default). It supports indexing, slicing, broadcasting, and linear algebra on these arrays."),
("What is the role of Bokeh in data visualization?",
"Bokeh is a Python library for interactive visualizations for modern web browsers. It can produce interactive plots, dashboards, and streaming data visualizations, with server-backed apps."),
("Explain the difference between apply() and map() in Pandas",
"`map()` is used on Series to map values element-wise (often with a dict or function). `apply()` can be used on Series or DataFrame; when on DataFrame it applies a function along an axis and can return transformed rows/columns."),
("What are some advanced features of NumPy",
"Advanced features include broadcasting, structured arrays, masked arrays, ufuncs (universal functions) with methods like `reduce`, linear algebra (linalg), FFTs, and random sampling."),
("How does Pandas simplify time series analysis",
"Pandas has specialized datetime types (`datetime64[ns]`), `DatetimeIndex`, resampling (`resample()`), rolling/window functions, time-zone handling, and convenient parsing of dates when reading files."),
("What is the role of a pivot table in Pandas",
"A pivot table reshapes data to aggregate values (like Excel pivot): `pivot_table()` groups data by index/columns and applies an aggregation function (mean, sum) to produce a summarized table."),
("Why is NumPy’s array slicing faster than Python’s list slicing",
"NumPy slicing returns a view (no copy) into contiguous memory and the operations are implemented in C. Python list slicing creates new Python objects and must copy references, so it's slower."),
("What are some common use cases for Seaborn?",
"Exploratory data analysis: distribution plots (hist, kde), categorical plots (boxplot, violin), relational plots (scatter, regplot), matrix plots (heatmap), and pairwise plots (pairplot).")
]

for q,a in theory_qas:
    cells.append(new_markdown_cell(f"**Q:** {q}\n\n**A:** {a}"))

# Practical section header
cells.append(new_markdown_cell("## Practical Tasks (runnable code)\nThe following code cells are ready to run in Colab. They demonstrate typical tasks requested in the assignment."))

# 1: Create a 2D NumPy array and calculate sum of each row
code1 = """# 1. Create a 2D NumPy array and calculate the sum of each row
import numpy as np

arr = np.array([[1,2,3],[4,5,6],[7,8,9]])
print("Array:\\n", arr)
row_sums = arr.sum(axis=1)
print("Sum of each row:", row_sums)"""
cells.append(new_code_cell(code1))

# 2: Pandas script to find mean of a specific column
code2 = """# 2. Pandas script to find the mean of a specific column in a DataFrame
import pandas as pd
df = pd.DataFrame({'A':[10,20,30,40], 'B':[1.5, 2.5, 3.5, 4.5]})
print("DataFrame:\\n", df)
mean_A = df['A'].mean()
print("Mean of column A:", mean_A)"""
cells.append(new_code_cell(code2))

# 3: Create a scatter plot using Matplotlib
code3 = """# 3. Create a scatter plot using Matplotlib
import matplotlib.pyplot as plt
x = [1,2,3,4,5]
y = [2,4,1,3,7]
plt.figure(figsize=(6,4))
plt.scatter(x,y)
plt.title('Scatter plot (Matplotlib)')
plt.xlabel('x')
plt.ylabel('y')
plt.grid(True)
plt.show()"""
cells.append(new_code_cell(code3))

# 4: Correlation matrix with Seaborn heatmap
code4 = """# 4. Calculate correlation matrix using Pandas and visualize with Seaborn heatmap
import seaborn as sns
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randn(100,4), columns=list('ABCD'))
corr = df.corr()
print("Correlation matrix:\\n", corr)
sns.heatmap(corr, annot=True, fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()"""
cells.append(new_code_cell(code4))

# 5: Generate a bar plot using Plotly
code5 = """# 5. Generate a bar plot using Plotly
import plotly.express as px
import pandas as pd
df = pd.DataFrame({'fruits':['apple','banana','orange'], 'count':[10,15,7]})
fig = px.bar(df, x='fruits', y='count', title='Fruit Counts')
fig.show()"""
cells.append(new_code_cell(code5))

# 6: Create a DataFrame and add new column based on existing column
code6 = """# 6. Create a DataFrame and add a new column based on an existing column
import pandas as pd
df = pd.DataFrame({'price':[100,200,150], 'qty':[1,2,3]})
df['total'] = df['price'] * df['qty']
print(df)"""
cells.append(new_code_cell(code6))

# 7: Element-wise multiplication of two NumPy arrays
code7 = """# 7. Element-wise multiplication of two NumPy arrays
import numpy as np
a = np.array([1,2,3])
b = np.array([4,5,6])
print("a * b =", a * b)"""
cells.append(new_code_cell(code7))

# 8: Line plot with multiple lines using Matplotlib
code8 = """# 8. Create a line plot with multiple lines using Matplotlib
import matplotlib.pyplot as plt
x = [1,2,3,4,5]
y1 = [1,4,9,16,25]
y2 = [2,3,5,7,11]
plt.figure(figsize=(7,4))
plt.plot(x,y1, label='y = x^2')
plt.plot(x,y2, label='example series')
plt.title('Multiple lines')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()"""
cells.append(new_code_cell(code8))

# 9: Filter rows where column value is greater than threshold
code9 = """# 9. Generate a Pandas DataFrame and filter rows where a column > threshold
import pandas as pd
df = pd.DataFrame({'name':['a','b','c','d'], 'score':[45,78,88,60]})
filtered = df[df['score'] > 60]
print("Filtered rows:\\n", filtered)"""
cells.append(new_code_cell(code9))

# 10: Histogram using Seaborn
code10 = """# 10. Create a histogram using Seaborn to visualize a distribution
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
data = np.random.normal(loc=0, scale=1, size=500)
sns.histplot(data, kde=True)
plt.title('Histogram with KDE')
plt.show()"""
cells.append(new_code_cell(code10))

# 11: Matrix multiplication using NumPy
code11 = """# 11. Perform matrix multiplication using NumPy
import numpy as np
A = np.array([[1,2],[3,4]])
B = np.array([[5,6],[7,8]])
C = A.dot(B)  # or np.matmul(A,B)
print("A.dot(B) =\\n", C)"""
cells.append(new_code_cell(code11))

# 12: Use Pandas to load a CSV and display first 5 rows (example using created CSV)
code12 = """# 12. Use Pandas to load a CSV file and display first 5 rows
import pandas as pd
# Creating an example CSV to demonstrate reading (in Colab normally you'd upload or mount Drive)
csv_path = 'example_data.csv'
pd.DataFrame({'A':[1,2,3,4,5], 'B':[10,20,30,40,50]}).to_csv(csv_path, index=False)
df = pd.read_csv(csv_path)
print(df.head())"""
cells.append(new_code_cell(code12))

# 13: Create a 3D scatter plot using Plotly
code13 = """# 13. Create a 3D scatter plot using Plotly
import plotly.express as px
import pandas as pd
import numpy as np
df = pd.DataFrame({'x':np.random.randn(100), 'y':np.random.randn(100), 'z':np.random.randn(100)})
fig = px.scatter_3d(df, x='x', y='y', z='z', title='3D Scatter (Plotly)')
fig.show()"""
cells.append(new_code_cell(code13))





SyntaxError: unterminated triple-quoted string literal (detected at line 205) (ipython-input-806791228.py, line 201)