# In [None]:
# Example 1: Handling Networks

# Task: Create and analyze a directed graph based on conversation data.
import pandas as pd
import networkx as nx

# Sample DataFrame (conversation data): one row per utterance; "reply_to"
# holds the id of the utterance being answered (None for an opening line).
data = {
    "id": ["u1", "u2", "u3", "u4", "u5"],
    "speaker": ["Ross", "Rachel", "Ross", "Monica", "Chandler"],
    "reply_to": [None, "u1", "u2", "u3", "u4"],
    "season": ["s01", "s01", "s01", "s01", "s01"],
    "episode": ["e01", "e01", "e01", "e01", "e01"]
}
df = pd.DataFrame(data)

# Map every utterance id to its speaker once, instead of re-scanning the
# DataFrame for each reply. drop_duplicates keeps the first row per id.
speaker_by_id = df.drop_duplicates("id").set_index("id")["speaker"].to_dict()

# Create a MultiDiGraph: parallel edges keep every individual reply
G = nx.MultiDiGraph()

# Add nodes (speakers) and edges (replies): replier -> speaker replied to
for utterance in df.itertuples(index=False):
    G.add_node(utterance.speaker)  # Add speaker as a node
    # pd.notna rejects both None and NaN (NaN is truthy, so a plain truth test
    # would treat a missing reply as a real one); ids that do not appear in
    # the data are skipped rather than raising.
    if pd.notna(utterance.reply_to) and utterance.reply_to in speaker_by_id:
        G.add_edge(
            utterance.speaker,
            speaker_by_id[utterance.reply_to],
            season=utterance.season,
            episode=utterance.episode,
        )

# Print graph properties
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
print("Nodes:", G.nodes())
print("Edges:", G.edges(data=True))

# Alternative representation: Weighted graph
# Create a DiGraph with edge weights (number of replies between nodes)
weighted_G = nx.DiGraph()
# Edge attributes are not needed here; iterating plain (source, target) pairs
# also avoids rebinding the module-level name `data`.
for source, target in G.edges():
    if weighted_G.has_edge(source, target):
        weighted_G[source][target]["weight"] += 1
    else:
        weighted_G.add_edge(source, target, weight=1)

print("\nWeighted Graph Edges:", weighted_G.edges(data=True))

# Example 2: Handling Text

# Task: Process text data, generate a word-frequency matrix, and compute TF-IDF.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Sample DataFrame (text data): one row per utterance, already tokenized
data = {
    "id": ["u1", "u2", "u3"],
    "speaker": ["Chandler", "Ross", "Chandler"],
    "tokens": [
        ["Could", "I", "be", "more", "sarcastic"],
        ["We", "were", "on", "a", "break"],
        ["Oh", "my", "God", "could", "I", "be", "more", "excited"]
    ],
    "episode": ["s01_e01", "s01_e01", "s01_e01"]
}
df = pd.DataFrame(data)

# Boolean mask selecting Chandler's utterances (reused below)
is_chandler = df["speaker"] == "Chandler"

# Flatten tokens for Chandler.
# An explicit loop instead of Series.sum(): summing an empty selection
# yields the integer 0 rather than an empty list.
chandler_tokens = [
    token for utterance in df.loc[is_chandler, "tokens"] for token in utterance
]

# Step A: Create a sorted list of unique tokens
unique_tokens = sorted(set(chandler_tokens))
print("Unique Tokens:", unique_tokens[:5])

# Step B: Create a word-frequency matrix (rows: episodes, columns: tokens)
episodes = df["episode"].unique()
word_matrix = np.zeros((len(episodes), len(unique_tokens)))

# token -> column index, so each lookup is O(1) instead of list.index()
column_of = {token: j for j, token in enumerate(unique_tokens)}

for i, ep in enumerate(episodes):
    # Each comparison must be parenthesized: `&` binds tighter than `==`,
    # so `df["episode"] == ep & mask` would evaluate `ep & mask` first.
    in_episode = (df["episode"] == ep) & is_chandler
    for utterance in df.loc[in_episode, "tokens"]:
        for token in utterance:
            j = column_of.get(token)
            if j is not None:
                word_matrix[i, j] += 1

print("\nWord Matrix Shape:", word_matrix.shape)
print("Word Matrix (Chandler):", word_matrix)

# Step C: Convert to TF-IDF
# The documents are already tokenized, so the token lists themselves are fed
# to the vectorizer and the tokenizer/preprocessor simply pass them through.
# Joining the tokens into strings first would make the pass-through tokenizer
# iterate over the string, i.e. split every document into single characters.
# token_pattern=None states that the default regex is intentionally unused
# because a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(
    vocabulary=unique_tokens,
    lowercase=False,
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
)
tfidf_matrix = vectorizer.fit_transform(df["tokens"])
print("\nTF-IDF Matrix Shape:", tfidf_matrix.shape)

# Example 3: Descriptive Statistics and Data Summarization

# Task: Summarize a dataset of TV show episodes and explore character interactions.
import pandas as pd

# Sample DataFrame: per-character totals of spoken lines and scenes
data = {
    "character": ["Ross", "Rachel", "Monica", "Chandler", "Phoebe", "Joey"],
    "lines": [1200, 1100, 950, 1050, 870, 1000],
    "scenes": [80, 75, 60, 70, 50, 65]
}
df = pd.DataFrame(data)

# Descriptive statistics of the numeric columns
print("Summary Statistics:")
summary = df.describe()
print(summary)

# Derived column: how many lines a character speaks per scene on average
df["lines_per_scene"] = df["lines"].div(df["scenes"])
print("\nLines Per Scene:")
print(df)

# Row label of the largest "lines" value, then the character on that row
top_row = df["lines"].idxmax()
most_lines = df.at[top_row, "character"]
print(f"\nCharacter with the most lines: {most_lines}")

# Visualize lines per character
import matplotlib.pyplot as plt

# Bar chart on the current axes: one bar per character, height = line count
axes = plt.gca()
axes.bar(df["character"], df["lines"])
axes.set_title("Lines Per Character")
axes.set_xlabel("Character")
axes.set_ylabel("Lines")
plt.show()

# Example 4: Advanced Graph Analysis

# Task: Analyze character connections in a show using network centrality metrics.
import networkx as nx

# Directed graph of sample character connections (source -> target)
edges = [
    ("Ross", "Rachel"), ("Rachel", "Ross"),
    ("Chandler", "Joey"), ("Joey", "Chandler"),
    ("Monica", "Phoebe"), ("Phoebe", "Monica"),
    ("Ross", "Chandler"), ("Rachel", "Monica")
]

G = nx.DiGraph()
G.add_edges_from(edges)

# Centrality measures, printed in a fixed order as "<label> <dict per node>"
centrality_measures = (
    ("Degree Centrality:", nx.degree_centrality),
    ("Betweenness Centrality:", nx.betweenness_centrality),
    ("Closeness Centrality:", nx.closeness_centrality),
)
for label, measure in centrality_measures:
    print(label, measure(G))

# Visualize the graph
import matplotlib.pyplot as plt

# Node positions come from a force-directed layout; no seed is passed
pos = nx.spring_layout(G)
draw_options = {
    "with_labels": True,
    "node_size": 1500,
    "node_color": "lightblue",
    "arrowsize": 20,
}
nx.draw(G, pos, **draw_options)
plt.title("Character Interaction Graph")
plt.show()

# Example 5: Regression Analysis for Length of Utterances

# Task: Investigate the relationship between seasons and utterance lengths.
import statsmodels.api as sm
import pandas as pd

# Sample DataFrame: average utterance length per season
data = {
    "season": [1, 2, 3, 4, 5],
    "avg_length": [50, 55, 60, 58, 65]
}
df = pd.DataFrame(data)

# Response variable, and the predictor with an added "const" column so the
# model can fit an intercept
y = df["avg_length"]
X = sm.add_constant(df["season"])

# Ordinary least squares fit of avg_length on season
ols = sm.OLS(y, X)
model = ols.fit()

# Full regression report
print(model.summary())

# Fitted coefficients, looked up by column name
coefficients = model.params
intercept = coefficients["const"]
slope = coefficients["season"]
print(f"Intercept: {intercept}, Slope: {slope}")

# Example 6: Text Sentiment Analysis

# Task: Perform sentiment analysis on a dataset of dialogue lines.
import pandas as pd
from textblob import TextBlob

# Sample DataFrame of dialogue: one quote per character
data = {
    "character": ["Ross", "Rachel", "Chandler", "Monica"],
    "dialogue": [
        "We were on a break!",
        "It's like all my life everyone has always told me, 'You're a shoe!'",
        "Could I BE wearing any more clothes?",
        "Welcome to the real world. It sucks. You’re gonna love it!"
    ]
}
df = pd.DataFrame(data)

# Analyze sentiment
# Each line is analyzed once and both scores are read from that single
# result, instead of building and scoring a TextBlob twice per line.
sentiments = [TextBlob(line).sentiment for line in df["dialogue"]]
df["polarity"] = [sentiment.polarity for sentiment in sentiments]
df["subjectivity"] = [sentiment.subjectivity for sentiment in sentiments]

print(df)

# Visualize sentiment: polarity vs. subjectivity, one point per character
import seaborn as sns

sns.scatterplot(data=df, x="polarity", y="subjectivity", hue="character")
plt.title("Sentiment Analysis of Character Dialogues")
plt.show()

# Example 7: Unsupervised Learning with Clustering

# Task: Group characters based on their interaction statistics.
from sklearn.cluster import KMeans
import numpy as np

# Sample DataFrame of character interactions
data = {
    "character": ["Ross", "Rachel", "Monica", "Chandler", "Phoebe", "Joey"],
    "lines_spoken": [1200, 1100, 950, 1050, 870, 1000],
    "interactions": [300, 280, 260, 270, 250, 290]
}
df = pd.DataFrame(data)

# Feature matrix: the two numeric columns, used unscaled
feature_names = ["lines_spoken", "interactions"]
X = df[feature_names]

# Two-cluster KMeans with a fixed seed; the fitted labels give each
# character's cluster id
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)
df["cluster"] = kmeans.labels_

print(df)

# Visualize clusters: color = cluster, marker = character
sns.scatterplot(
    data=df,
    x="lines_spoken",
    y="interactions",
    hue="cluster",
    style="character",
    s=100,
)
plt.title("Character Clustering")
plt.show()

# Tricks and Examples for Pandas DataFrame Manipulation

# Example 8: Basic DataFrame Operations

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, 30, 35, 40],
    "City": ["New York", "Los Angeles", "Chicago", "Houston"]
}
df = pd.DataFrame(data)

# Column selection returns a Series
print("Selecting Columns:")
print(df.loc[:, "Name"])

# Row filtering with a boolean mask
print("\nFiltering Rows:")
older_than_30 = df["Age"].gt(30)
print(df.loc[older_than_30])

# New column derived from an existing one
df["Age in Months"] = df["Age"].mul(12)
print("\nNew Column:")
print(df)

# Column rename (rebinding the result instead of mutating in place)
df = df.rename(columns={"City": "Location"})
print("\nRenamed Columns:")
print(df)

# Example 9: Grouping and Aggregations

data = {
    "Team": ["A", "B", "A", "B"],
    "Score": [10, 15, 20, 25],
    "Player": ["Alice", "Bob", "Charlie", "David"]
}
df = pd.DataFrame(data)

# Total score per team: group the Score column by the Team column
scores_by_team = df["Score"].groupby(df["Team"])
grouped = scores_by_team.sum()
print("\nGrouped Data:")
print(grouped)

# Example 10: Merging DataFrames

left = pd.DataFrame({"ID": [1, 2, 3], "Name": ["Alice", "Bob", "Charlie"]})
right = pd.DataFrame({"ID": [1, 2, 4], "City": ["New York", "Los Angeles", "Chicago"]})

# Inner join: only IDs present in both frames survive
merged = left.merge(right, how="inner", on="ID")
print("\nMerged DataFrame:")
print(merged)


# Tricks and Examples for Pandas DataFrame Manipulation
# (self-contained list; its Examples 1-3 repeat Examples 8-10 above)

import pandas as pd

# Example 1: Basic DataFrame Operations

# Sample DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, 30, 35, 40],
    "City": ["New York", "Los Angeles", "Chicago", "Houston"]
}
df = pd.DataFrame(data)

# Pick a single column
print("Selecting Columns:")
name_column = df["Name"]
print(name_column)

# Keep only rows whose Age exceeds 30
print("\nFiltering Rows:")
print(df.query("Age > 30"))

# Derive a new column from Age
df["Age in Months"] = 12 * df["Age"]
print("\nNew Column:")
print(df)

# Rename City -> Location, leaving every other column name untouched
df = df.rename(columns={"City": "Location"})
print("\nRenamed Columns:")
print(df)

# Example 2: Grouping and Aggregations

data = {
    "Team": ["A", "B", "A", "B"],
    "Score": [10, 15, 20, 25],
    "Player": ["Alice", "Bob", "Charlie", "David"]
}
df = pd.DataFrame(data)

# Sum of scores within each team
grouped = df.groupby("Team")["Score"].agg("sum")
print("\nGrouped Data:")
print(grouped)

# Example 3: Merging DataFrames

left = pd.DataFrame({"ID": [1, 2, 3], "Name": ["Alice", "Bob", "Charlie"]})
right = pd.DataFrame({"ID": [1, 2, 4], "City": ["New York", "Los Angeles", "Chicago"]})

# Inner join on the shared ID column
merged = pd.merge(left=left, right=right, how="inner", on="ID")
print("\nMerged DataFrame:")
print(merged)

# Example 4: Pivot Tables

data = {
    "Date": ["2023-01-01", "2023-01-02", "2023-01-01", "2023-01-02"],
    "Category": ["A", "A", "B", "B"],
    "Value": [10, 20, 30, 40]
}
df = pd.DataFrame(data)

# One row per Date, one column per Category, cells hold the summed Value
pivot = pd.pivot_table(
    df,
    values="Value",
    index="Date",
    columns="Category",
    aggfunc="sum",
)
print("\nPivot Table:")
print(pivot)

# Example 5: Handling Missing Data

data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, None, 35],
    "City": ["New York", "Los Angeles", None]
}
# Keep the unfilled frame so both strategies below start from the same
# data that still contains the missing values.
raw_df = pd.DataFrame(data)

# Fill missing values: Age with the column mean, City with a placeholder
df = raw_df.fillna({"Age": raw_df["Age"].mean(), "City": "Unknown"})
print("\nFilled Missing Values:")
print(df)

# Drop rows with missing values.
# Applied to the unfilled data: on the filled frame there would be nothing
# left to drop and this step would return every row unchanged.
dropped_df = raw_df.dropna()
print("\nDropped Rows with Missing Values:")
print(dropped_df)

# Example 6: Sorting Data

data = {
    "Name": ["Charlie", "Alice", "Bob"],
    "Age": [35, 25, 30],
    "Score": [90, 80, 85]
}
df = pd.DataFrame(data)

# Ascending sort on one key
df = df.sort_values("Age")
print("\nSorted by Age:")
print(df)

# Two keys: Score descending first, Age ascending as the tie-breaker
sort_keys = ["Score", "Age"]
df = df.sort_values(sort_keys, ascending=[False, True])
print("\nSorted by Score and Age:")
print(df)

# Example 7: Apply and Lambda Functions

data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Score": [85, 90, 88]
}
df = pd.DataFrame(data)


def _letter_grade(score):
    # "A" from 90 points upwards, otherwise "B"
    if score >= 90:
        return "A"
    return "B"


# Element-wise mapping of each score to its letter grade
df["Grade"] = df["Score"].map(_letter_grade)
print("\nApplied Custom Function:")
print(df)

# Example 8: String Operations

data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Email": ["alice@example.com", "bob@example.com", "charlie@example.com"]
}
df = pd.DataFrame(data)

# Split each address at "@" and keep the second piece (the domain)
email_parts = df["Email"].str.split("@")
df["Domain"] = email_parts.str.get(1)
print("\nExtracted Domain:")
print(df)

# Example 9: Working with Dates

data = {
    "Event": ["Meeting", "Conference", "Webinar"],
    "Date": ["2023-01-01", "2023-02-15", "2023-03-10"]
}
df = pd.DataFrame(data)

# Parse the date strings into datetime values
df["Date"] = pd.to_datetime(df["Date"])
print("\nConverted to Datetime:")
print(df)

# Whole days between a fixed reference date (2023-01-01) and each event
reference_day = pd.Timestamp("2023-01-01")
time_to_event = df["Date"].sub(reference_day)
df["Days Until"] = time_to_event.dt.days
print("\nDays Until Event:")
print(df)

# Example 10: Reshaping Data

data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Subject": ["Math", "Science", "English"],
    "Score": [85, 90, 88]
}
df = pd.DataFrame(data)

# Wide -> long: Subject and Score become rows of (Attribute, Value) per Name
melted = df.melt(
    id_vars="Name",
    value_vars=["Subject", "Score"],
    var_name="Attribute",
    value_name="Value",
)
print("\nMelted DataFrame:")
print(melted)

# Example 11: MultiIndex DataFrames

data = {
    "Region": ["North", "North", "South", "South"],
    "Product": ["A", "B", "A", "B"],
    "Sales": [100, 150, 200, 250]
}
df = pd.DataFrame(data)

# Two-level row index: Region (outer) and Product (inner)
index_levels = ["Region", "Product"]
df = df.set_index(index_levels)
print("\nMultiIndex DataFrame:")
print(df)

# Cross-section for one outer label; the Region level is dropped from the result
data_north = df.xs("North", level="Region")
print("\nData for North Region:")
print(data_north)

# Example 12: Combining DataFrames

# Stack two frames with identical columns and renumber the rows 0..n-1
df1 = pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]})
df2 = pd.DataFrame({"Name": ["Charlie", "David"], "Age": [35, 40]})
frames = (df1, df2)
combined = pd.concat(frames, ignore_index=True)
print("\nConcatenated DataFrame:")
print(combined)

# Example 13: Sampling Data

data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Score": [85, 90, 88, 92]
}
df = pd.DataFrame(data)

# Draw two rows at random; the fixed seed makes the draw repeatable
sample_size = 2
sampled = df.sample(sample_size, random_state=42)
print("\nRandom Sample of Rows:")
print(sampled)

# Example 14: Exploding Lists into Rows

data = {
    "Name": ["Alice", "Bob"],
    "Hobbies": [["Reading", "Cycling"], ["Swimming", "Hiking"]]
}
df = pd.DataFrame(data)

# One output row per list element; the original row label is repeated
df_exploded = df.explode(column="Hobbies")
print("\nExploded DataFrame:")
print(df_exploded)


# In [None]:
# matching

# Examples of Matching by Propensity Score or Features in Common

import pandas as pd
import numpy as np
import networkx as nx

# Sample DataFrame: 20 synthetic units with a binary treatment flag, two
# numeric features and a propensity score. The seed is fixed and the draws
# happen in the order treatment, Feature1, Feature2, propensity.
np.random.seed(42)
n_samples = 20
treatment_flags = np.random.choice([0, 1], size=n_samples, p=[0.5, 0.5])
feature1_values = np.random.normal(0, 1, n_samples)
feature2_values = np.random.normal(5, 2, n_samples)
propensity_scores = np.random.uniform(0, 1, n_samples)
data = {
    "ID": range(1, n_samples + 1),
    "Applied_Treatment": treatment_flags,
    "Feature1": feature1_values,
    "Feature2": feature2_values,
    "Propensity_Score": propensity_scores
}
data_df = pd.DataFrame(data)

# Split by treatment flag: 1 = treated, 0 = control
is_treated = data_df["Applied_Treatment"].eq(1)
is_control = data_df["Applied_Treatment"].eq(0)
treatment_df = data_df.loc[is_treated]
control_df = data_df.loc[is_control]

print(f"There are {treatment_df.shape[0]} samples in the treated group.")
print(f"There are {control_df.shape[0]} samples in the control group.")

# Similarity of two rows from the Euclidean distance between their features
def get_feature_similarity(row1, row2, feature_columns):
    """Return 1 / (1 + d), where d is the Euclidean distance between the rows.

    Only the entries listed in *feature_columns* are compared. Identical
    feature values give 1.0; the value shrinks towards 0 as the rows move
    apart.
    """
    difference = row1[feature_columns] - row2[feature_columns]
    distance = np.linalg.norm(difference)
    return 1 / (1 + distance)

# Matching using Propensity Score
# Graph with one node per sample ID; a treated/control pair is connected
# when their propensity scores are close, weighted by that closeness.
G = nx.Graph()

# itertuples keeps each column's own dtype. iterrows would pack every row
# into a single Series, upcasting the integer IDs to floats in an
# all-numeric frame, so the matched pairs would be reported as 3.0, 7.0, ...
control_records = list(control_df.itertuples(index=False))
for treated in treatment_df.itertuples(index=False):
    for control in control_records:
        # 1.0 for identical scores, decreasing with the absolute difference
        similarity = 1 - abs(treated.Propensity_Score - control.Propensity_Score)
        if similarity > 0.8:  # Adjust threshold as needed
            G.add_edge(treated.ID, control.ID, weight=similarity)

# maxcardinality=True: pair as many samples as possible first, and among
# those matchings choose the one with the largest total weight
matching = nx.max_weight_matching(G, maxcardinality=True)

print("\nMatching by Propensity Score:")
print(matching)

# Matching using Features (Feature1 and Feature2)
feature_columns = ["Feature1", "Feature2"]
G = nx.Graph()

# IDs are read from the ID column itself so they keep their integer type.
# Taking them from iterrows rows would upcast them to floats in an
# all-numeric frame. The rows handed to get_feature_similarity only need
# the feature columns.
control_rows = list(zip(
    control_df["ID"].tolist(),
    (row for _, row in control_df[feature_columns].iterrows()),
))
treated_rows = zip(
    treatment_df["ID"].tolist(),
    (row for _, row in treatment_df[feature_columns].iterrows()),
)

for treatment_id, treatment_row in treated_rows:
    for control_id, control_row in control_rows:
        similarity = get_feature_similarity(treatment_row, control_row, feature_columns)
        if similarity > 0.7:  # Adjust threshold as needed
            G.add_edge(treatment_id, control_id, weight=similarity)

# maxcardinality=True: pair as many samples as possible first, and among
# those matchings choose the one with the largest total weight
feature_matching = nx.max_weight_matching(G, maxcardinality=True)

print("\nMatching by Features:")
print(feature_matching)

# Additional Matching Methods

# 1. Exact Matching on Features (e.g., categorical variables)
def exact_match(df1, df2, column):
    """Return every (df1 ID, df2 ID) pair whose *column* values are equal.

    Both frames must have an "ID" column and the given *column*. Pairs are
    listed in df1 row order and, within one df1 row, in df2 row order. One
    row may appear in several pairs. Missing values never match, because
    NaN is not equal to itself.

    IDs and values are read column-wise rather than through iterrows, so
    the IDs keep their own type. iterrows would turn integer IDs into
    floats whenever all columns are numeric.
    """
    right_pairs = list(zip(df2["ID"].tolist(), df2[column].tolist()))
    return [
        (left_id, right_id)
        for left_id, left_value in zip(df1["ID"].tolist(), df1[column].tolist())
        for right_id, right_value in right_pairs
        if left_value == right_value
    ]

# Random A/B label per sample, used as the key for exact matching
group_labels = np.random.choice(["A", "B"], size=20)
data_df["Group"] = group_labels

# Re-split so both groups carry the new Group column
treated_mask = data_df["Applied_Treatment"].eq(1)
control_mask = data_df["Applied_Treatment"].eq(0)
treatment_df = data_df.loc[treated_mask]
control_df = data_df.loc[control_mask]

# All treated/control ID pairs that share the same Group label
exact_matches = exact_match(treatment_df, control_df, "Group")
print("\nExact Matches:")
print(exact_matches)

# 2. Nearest Neighbor Matching
from sklearn.neighbors import NearestNeighbors

# Feature matrices for both groups, rows in DataFrame order
X_treatment = treatment_df[feature_columns].to_numpy()
X_control = control_df[feature_columns].to_numpy()

# Index the controls, then look up the single closest control (Euclidean)
# for every treated sample
nn = NearestNeighbors(n_neighbors=1, metric="euclidean")
nn.fit(X_control)
_, indices = nn.kneighbors(X_treatment)

# indices holds positions into control_df; translate positions to IDs.
# The same control may be the nearest neighbour of several treated samples.
treated_ids = treatment_df["ID"]
control_ids = control_df["ID"]
nearest_neighbor_matches = [
    (treated_ids.iloc[treated_pos], control_ids.iloc[control_pos])
    for treated_pos, control_pos in enumerate(indices.ravel())
]

print("\nNearest Neighbor Matches:")
print(nearest_neighbor_matches)

# 3. Propensity Score Stratification
# Five equally spaced edges over [0, 1] give four strata; pd.cut labels each
# sample with the interval its score falls into.
# NOTE(review): with pd.cut's defaults the lowest edge is exclusive, so a
# score of exactly 0 would get no bin. Confirm whether that matters here.
bins = np.linspace(0, 1, 5)
data_df["Propensity_Bin"] = pd.cut(data_df["Propensity_Score"], bins=bins)

# Propensity_Bin is categorical. observed=True reports only the
# (bin, treatment) combinations that actually contain samples, and passing
# it explicitly avoids pandas' deprecation warning about the changing
# default for categorical group keys.
stratified_groups = data_df.groupby(["Propensity_Bin", "Applied_Treatment"], observed=True)
print("\nStratified Groups:")
for name, group in stratified_groups:
    print(f"Group: {name}")
    print(group)