In [1]:
import collections
from collections import Counter
from collections import defaultdict
import pandas as pd
import numpy as np
import torch
import time 
import random
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
import scipy.stats as stats
from scipy.stats import entropy, spearmanr
from scipy.stats import wasserstein_distance
from scipy.spatial.distance import jensenshannon, cosine
import regex as re
import spacy
import string
import os
import cProfile
import pstats
import io
import json
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.options.mode.chained_assignment = None
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
nlp = spacy.load("en_core_web_lg")
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
device_id = 0 if torch.backends.mps.is_available() else -1

In [4]:
dm = pd.read_csv("gecko_pelts.csv", index_col = 0)
energy_drink = [x for x in dm['word']]
da = pd.read_csv("jan_mar_2023poldumps.csv", index_col = 0)
dg = da.dropna().drop_duplicates(subset="comment")

In [5]:
model_name = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
fill_mask_pipeline = pipeline("fill-mask", model=model_name, device=device_id)

In [6]:
def clean_prose(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r"^>\s*", "", text)  # Remove leading '>'
    text = re.sub(r">>\d+\s*", "", text)  # Remove quote references
    text = re.sub(r"^[^a-zA-Z]+|[^a-zA-Z.!?]+$", "", text)  # Trim unwanted characters
    text = re.sub(r"\n+", " ", text)  # Replace newlines with spaces
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    text = re.sub(r"http[s]?://[^\s>]+|www\.[^\s>]+", "", text)  # Remove URLs
    return text

def split_sentences(text):
    if not isinstance(text, str):
        return []
    return [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

def filter_by_word(df, column, word):
    if not isinstance(word, str):
        raise ValueError(f"Invalid word input: expected string, got {type(word)}")
    
    pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
    return df[df[column].astype(str).str.contains(pattern, na=False)]

def filter_strings_by_word(sentences, word):
    if not isinstance(word, str):
        raise ValueError(f"Invalid word input: expected string, got {type(word)}")
    
    pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
    return [s for s in sentences if pattern.search(s)]

def is_valid_word(word):
    return bool(re.match(r"^[a-zA-Z]+$", word))

def clean_dataframe(df):
    df.drop_duplicates(subset="comment", keep="first", inplace=True)
    df = df[df["comment"].apply(lambda x: len(str(x).split(" ")) <= 512)]
    return df.reset_index(drop=True) if len(df) >= 100 else pd.DataFrame()

def og_thedf(df, word):
    df["comment"] = df["comment"].apply(clean_prose)
    df = filter_by_word(df, "comment", word)
    df["com_list_it1"] = df["comment"].apply(split_sentences)
    df["sent_with_word"] = df["com_list_it1"].apply(lambda x: filter_strings_by_word(x, word))
    df = df.explode("sent_with_word").drop_duplicates(subset=["comment"]).reset_index(drop=True)
    return df

def getopdf(df, word):
    return og_thedf(df, word)[lambda x: x["op"] == 1]

def group_threads_by_timestamp(df, thread_col="thread_num", num_col="num", comment_col="comment", timestamp_col="timestamp"):
    thread_dict = defaultdict(list)
    for thread, thread_df in df.groupby(thread_col):
        sorted_comments = thread_df.sort_values(by=timestamp_col)[[num_col, comment_col]].to_dict(orient='records')
        thread_dict[thread] = sorted_comments
    return thread_dict

def df_to_thread_dict(df):
    thread_dict = {
        thread: thread_df.sort_values(by="timestamp")[["num", "comment"]].to_dict(orient='records')
        for thread, thread_df in df.groupby("thread_num")
    }
    return thread_dict

def precompute_thread_dict(df):
    thread_dict = {}
    for thread, thread_df in df.groupby("thread_num"):
        thread_dict[thread] = thread_df.sort_values(by="timestamp")[["num", "comment"]].to_dict(orient='records')
    return thread_dict

def pretokenize_comments(thread_dict):
    for thread, posts in thread_dict.items():
        for post in posts:
            post['tokenized_comment'] = set(post['comment'].lower().split())  # Store pre-tokenized words
    return thread_dict

karina = pretokenize_comments(precompute_thread_dict(dg)) ### TURNING DATAFRAME INTO A DICTIONARY OF THREADS!!! 

In [7]:
def count_valid_threads(thread_dict, words):
    valid_thread_counts = {}
    for word in words:
        if not isinstance(word, str):
            print(f"Skipping invalid word: {word} (not a string)")
            continue
        word_pattern = re.compile(rf'\b{re.escape(word)}\b', re.IGNORECASE)
        valid_threads = 0

        for thread_id, comments in thread_dict.items():
            # Find all comments containing the target word
            appearances = (c for c in comments if word_pattern.search(c['comment']))
            appearances_list = list(appearances)

            # Count only threads where the word appears at least twice
            if len(appearances_list) > 1:
                valid_threads += 1

        if valid_threads >= 100:  # Only keep words that have at least 100 valid threads
            valid_thread_counts[word] = valid_threads
    return valid_thread_counts

def filter_and_extract_word_stats(thread_dict, target_word):
    if not isinstance(target_word, str):
        raise ValueError(f"Invalid target_word input: expected string, got {type(target_word)}")
    word_pattern = re.compile(rf'\b{re.escape(target_word)}\b', re.IGNORECASE)
    data = []
    for thread_id, comments in thread_dict.items():
        # Use a generator to find comments containing the target word (faster than list comprehensions)
        appearances = (c for c in comments if word_pattern.search(c['comment']))
        appearances_list = list(appearances)
        if len(appearances_list) > 1:
            first_comment, last_comment = appearances_list[0], appearances_list[-1]

            data.append({
                'thread_id': thread_id,
                'first_appearance': first_comment['comment'],
                'first_id': first_comment['num'],
                'last_appearance': last_comment['comment'],
                'last_id': last_comment['num'],
                'length': len(comments)  # Total number of comments in thread
            })
    return pd.DataFrame(data)

In [8]:
def mask_df(a_df, a_word, tokenizer = tokenizer):
    if not isinstance(a_word, str):
        raise ValueError(f"Invalid a_word input: expected string, got {type(a_word)}")
    mask_token = tokenizer.mask_token
    word_pattern = re.compile(rf'\b{re.escape(a_word)}\b', re.IGNORECASE)
    
    def masker(text):
        return word_pattern.sub(mask_token, text) if isinstance(text, str) else text
    
    def trim_long_text(text, limit=450):
        if not isinstance(text, str):
            return text
        tokens = tokenizer.tokenize(text)
        if len(tokens) <= limit:
            return text
        words = text.split()
        word_indices = [i for i, w in enumerate(words) if word_pattern.search(w)]
        if not word_indices:
            return " ".join(words[:limit])
        
        target_index = word_indices[0]
        left, right = max(0, target_index - limit//2), min(len(words), target_index + limit//2)
        return " ".join(words[left:right])
    
    a_df["first_appearance"] = a_df["first_appearance"].apply(masker)
    a_df["last_appearance"] = a_df["last_appearance"].apply(masker)
    a_df = a_df[a_df["first_appearance"].str.contains(mask_token, na=False) | a_df["last_appearance"].str.contains(mask_token, na=False)]
    a_df["first_appearance"] = a_df["first_appearance"].apply(trim_long_text)
    a_df["last_appearance"] = a_df["last_appearance"].apply(trim_long_text)
    return a_df

In [9]:
def get_random_samples(df, trial_num):
    dfs = []
    for i in range(1, trial_num + 1):
        dfs.append(df.sample(n=20, random_state=i))
    return dfs

In [10]:
def accumulate_meaning_unrecognized(df, mask_column, target_word, fill_mask_pipeline=fill_mask_pipeline, tokenizer=tokenizer, top_k=10):
    predictions_list = []
    mask_token = tokenizer.mask_token
    word_pattern = re.compile(rf'\b{re.escape(target_word)}\b', re.IGNORECASE)
    punctuation_set = set(string.punctuation)
    df[mask_column] = df[mask_column].astype(str)
    sentences = df[mask_column].str.strip().tolist()
    processed_sentences = [word_pattern.sub(mask_token, sentence.lower()) for sentence in sentences if sentence]
    # Skip sentences longer than 512 tokens and print a warning
    filtered_sentences = []
    for sentence in processed_sentences:
        if len(tokenizer.tokenize(sentence)) > 512:
            print(f"Skipping sentence (exceeds 512 tokens): {sentence[:100]}...")
            continue
        filtered_sentences.append(sentence)
    
    if not filtered_sentences:
        return predictions_list  # Return empty if no valid sentences remain

    # Batch tokenization with GPU support
    tokenized_sentences = tokenizer.batch_encode_plus(filtered_sentences, padding=True, truncation=True, return_tensors="pt")
    tokenized_sentences = {k: v.to(device) for k, v in tokenized_sentences.items()}  # Move to GPU if available

    # **Batch Inference Optimization with torch.no_grad()**
    with torch.no_grad():
        predictions_batch = fill_mask_pipeline(filtered_sentences)  # **Send all at once instead of looping**
    
    for predictions in predictions_batch:
        if not isinstance(predictions, list) or 'token_str' not in predictions[0]:
            continue

        filtered_predictions = {
            p['token_str'].strip(): np.log(p['score'])
            for p in predictions
            if 'token_str' in p and not all(char in punctuation_set for char in p['token_str'])
        }

        if not filtered_predictions:
            continue
        log_probs = np.array(list(filtered_predictions.values()))
        max_log_prob = np.max(log_probs)
        exp_probs = np.exp(log_probs - max_log_prob)
        normalized_probs = dict(zip(filtered_predictions.keys(), exp_probs / exp_probs.sum()))
        predictions_list.append(normalized_probs)
    return predictions_list

In [11]:
def get_pos(word):
    doc = nlp(word)
    return doc[0].pos_ if doc and doc[0].pos_ else "UNKNOWN"

def get_first_guess(dict_list):
    guess_counts = Counter()
    for d in dict_list:
        if d:
            best_guess = max(d, key=d.get)  # Get the word with the highest probability
            guess_counts[best_guess] += 1
    return guess_counts.most_common(1)[0][0] if guess_counts else None

def cosine_similarity(p, q):
    all_keys = set(p.keys()).union(set(q.keys()))
    p_vec = np.array([p.get(k, 0) for k in all_keys])
    q_vec = np.array([q.get(k, 0) for k in all_keys])
    
    if np.all(p_vec == 0) or np.all(q_vec == 0):  # Avoid division by zero
        return 0.0
    return 1 - cosine(p_vec, q_vec)

def wasserstein_similarity(p, q):
    all_keys = list(set(p.keys()).union(set(q.keys())))
    p_vec = np.array([p.get(k, 0) for k in all_keys])
    q_vec = np.array([q.get(k, 0) for k in all_keys])

    return wasserstein_distance(p_vec, q_vec)

def kl_divergence(p, q):
    p = np.array(list(p.values())) + 1e-10
    q = np.array(list(q.values())) + 1e-10
    return stats.entropy(p, q)

def js_divergence(p, q):
    p = np.array(list(p.values())) + 1e-10
    q = np.array(list(q.values())) + 1e-10
    m = 0.5 * (p + q)
    return 0.5 * (stats.entropy(p, m) + stats.entropy(q, m))

def entropy(p):
    p = np.array(list(p.values())) + 1e-10
    return stats.entropy(p)

def rank_stability(dict_list):
    ranks = [{k: rank for rank, (k, _) in enumerate(sorted(d.items(), key=lambda x: -x[1]))} for d in dict_list]
    diffs = [sum(abs(ranks[i][k] - ranks[i-1].get(k, len(ranks[i]))) for k in ranks[i]) for i in range(1, len(ranks))]
    return np.mean(diffs)

def jaccard_similarity(p, q, k=5):
    top_p = set(sorted(p, key=p.get, reverse=True)[:k])
    top_q = set(sorted(q, key=q.get, reverse=True)[:k])
    return len(top_p & top_q) / len(top_p | top_q)

def most_common_pos(dict_list):
    pos_counts = Counter()
    for d in dict_list:
        for word in d.keys():
            pos_counts[get_pos(word)] += 1
    return pos_counts.most_common(1)[0][0]

def semantic_drift(first_dist, last_dist):
    return 1 - cosine_similarity(first_dist, last_dist)  # Higher = more meaning change

def word_surprise(first_dist, last_dist):
    return kl_divergence(first_dist, last_dist)

def top_k_jaccard(first_dist, last_dist, k=5):
    return jaccard_similarity(first_dist, last_dist, k)

def pos_stability(first_dict, last_dict):
    first_pos = get_pos(get_first_guess([first_dict]))  # Get POS of most probable first guess
    last_pos = get_pos(get_first_guess([last_dict]))  # Get POS of most probable last guess
    return 1 if first_pos == last_pos else 0  # 1 = Same POS, 0 = Different POS

def entropy_change(first_dist, last_dist):
    return abs(entropy(first_dist) - entropy(last_dist))  # Absolute difference in uncertainty

def compute_metrics(first_dict, last_dict):
    if not first_dict or not last_dict:
        return None  

    metrics = {
        'Cosine_Similarity': cosine_similarity(first_dict, last_dict),
        'Wasserstein_Distance': wasserstein_similarity(first_dict, last_dict),
        'Entropy_First': entropy(first_dict),
        'Entropy_Last': entropy(last_dict),
        'Entropy_Change': entropy_change(first_dict, last_dict),
        'Rank_Stability': rank_stability([first_dict, last_dict]),
        'Temporal_Jaccard': jaccard_similarity(first_dict, last_dict),
        'Semantic_Drift': semantic_drift(first_dict, last_dict),
        'Word_Surprise': word_surprise(first_dict, last_dict),
        'Top_K_Jaccard': top_k_jaccard(first_dict, last_dict),
        'POS_Stability': pos_stability(first_dict, last_dict),
        'Most_Common_POS_First': get_pos(get_first_guess([first_dict])),
        'Most_Common_POS_Last': get_pos(get_first_guess([last_dict])),
        'First_Guess': get_first_guess([first_dict]),
        'Last_Guess': get_first_guess([last_dict])
    }
    return metrics


In [12]:
def aggregate_metrics(trial_metrics_list):
    # Remove empty or None trials
    trial_metrics_list = [trial for trial in trial_metrics_list if trial]

    if not trial_metrics_list:  # If all trials were empty, return an empty dict
        print("Warning: No valid trials found for aggregation.")
        return {}

    aggregated = {}

    # Updated numerical metrics (includes new metrics)
    numerical_keys = [
        'Cosine_Similarity', 'Wasserstein_Distance', 'Entropy_First', 'Entropy_Last', 
        'Entropy_Change', 'Rank_Stability', 'Temporal_Jaccard', 
        'Semantic_Drift', 'Word_Surprise', 'Top_K_Jaccard', 'POS_Stability'
    ]

    for key in numerical_keys:
        try:
            values = [trial[key] for trial in trial_metrics_list if key in trial and trial[key] is not None]

            if values:  # Ensure non-empty list to avoid errors
                mean_val = np.mean(values)
                std_val = np.std(values, ddof=1) if len(values) > 1 else 0  # Avoid division by zero
                ci_95 = (
                    mean_val - 1.96 * (std_val / np.sqrt(len(values))) if len(values) > 1 else mean_val,
                    mean_val + 1.96 * (std_val / np.sqrt(len(values))) if len(values) > 1 else mean_val
                )

                aggregated[key] = {
                    'Mean': mean_val,
                    'STD': std_val,
                    'CI_95': ci_95
                }
        except Exception as e:
            print(f"Skipping {key} due to missing data: {e}")

    # Aggregate categorical variables (POS and First/Last Guess)
    try:
        pos_first_counts = Counter(trial['Most_Common_POS_First'] for trial in trial_metrics_list if 'Most_Common_POS_First' in trial and trial['Most_Common_POS_First'])
        pos_last_counts = Counter(trial['Most_Common_POS_Last'] for trial in trial_metrics_list if 'Most_Common_POS_Last' in trial and trial['Most_Common_POS_Last'])

        aggregated['Most_Common_POS_First'] = pos_first_counts.most_common(1)[0][0] if pos_first_counts else None
        aggregated['Most_Common_POS_Last'] = pos_last_counts.most_common(1)[0][0] if pos_last_counts else None
    except Exception as e:
        print(f"Skipping POS aggregation due to missing data: {e}")

    try:
        first_guess_counts = Counter(trial['First_Guess'] for trial in trial_metrics_list if 'First_Guess' in trial and trial['First_Guess'])
        last_guess_counts = Counter(trial['Last_Guess'] for trial in trial_metrics_list if 'Last_Guess' in trial and trial['Last_Guess'])

        aggregated['First_Guess'] = first_guess_counts.most_common(1)[0][0] if first_guess_counts else None
        aggregated['Last_Guess'] = last_guess_counts.most_common(1)[0][0] if last_guess_counts else None
    except Exception as e:
        print(f"Skipping First/Last Guess aggregation due to missing data: {e}")

    return aggregated


In [13]:
def process_word_trials(words, dataframe, n):
    results = []
    total_words = len(words)
    start_time = time.time()

    for idx, word in enumerate(words, 1):
        word_start_time = time.time()
        print(f"Processing word {idx}/{total_words}: {word}")

        # Step 1: Extract and filter data
        try:
            covid = filter_and_extract_word_stats(dataframe, word)
            maskcovid = mask_df(covid, word)
            the_df = get_random_samples(maskcovid, n)

            if the_df.empty:
                print(f"Skipping {word}: No valid samples found.")
                continue

        except Exception as e:
            print(f"Error processing {word}: {e}")
            continue

        # Step 2: Compute meaning accumulation for first and last appearances
        try:
            first_appearance_dict_list = [accumulate_meaning_unrecognized(trial_df, 'first_appearance', word) for trial_df in the_df]
            last_appearance_dict_list = [accumulate_meaning_unrecognized(trial_df, 'last_appearance', word) for trial_df in the_df]

            # Remove any None or empty dictionaries from results
            first_appearance_dict_list = [d for d in first_appearance_dict_list if d]
            last_appearance_dict_list = [d for d in last_appearance_dict_list if d]

            if not first_appearance_dict_list or not last_appearance_dict_list:
                print(f"Skipping {word}: No valid meaning accumulation results.")
                continue

        except Exception as e:
            print(f"Error accumulating meaning for {word}: {e}")
            continue

        # Step 3: Compute Metrics
        try:
            first_metrics = [compute_metrics(first_dict, last_dict) for first_dict, last_dict in zip(first_appearance_dict_list, last_appearance_dict_list)]
            aggregated_metrics = aggregate_metrics(first_metrics)

        except Exception as e:
            print(f"Error computing metrics for {word}: {e}")
            continue

        # Step 4: Store Aggregated Results
        row = {'Word': word}
        for metric, values in aggregated_metrics.items():
            if isinstance(values, dict):
                row.update({
                    f'{metric}_Mean': values['Mean'],
                    f'{metric}_STD': values['STD'],
                    f'{metric}_CI': values['CI_95']
                })
            else:
                row[metric] = values

        results.append(row)

        # Step 5: Time tracking and progress update
        elapsed_time = time.time() - word_start_time
        remaining_time = ((time.time() - start_time) / idx) * (total_words - idx)
        print(f"Completed {word} in {elapsed_time:.2f} seconds. Estimated time remaining: {remaining_time:.2f} seconds.")

    return pd.DataFrame(results)


In [14]:
len(energy_drink) #1077
# get words with at least 100 threads where their first and last appearance are different 
valid_words_dict = count_valid_threads(karina, energy_drink)  # {word: valid_thread_count}
valid_words = list(valid_words_dict.keys())

KeyboardInterrupt: 

In [109]:
len(energy_drink) #1077
sublists = [energy_drink[i:i + 100] for i in range(0, len(energy_drink), 100)]

In [112]:
def process_and_save_sublists(the_sublists, dataframe, n_trials, folder_name="first_round"):
    os.makedirs(folder_name, exist_ok=True)  # Create the folder if it doesn't exist
    
    for idx, sublist in enumerate(the_sublists, 1):
        print(f"Processing sublist {idx}/10...")
        df = process_word_trials(sublist, dataframe, n_trials)
        file_path = os.path.join(folder_name, f"first_round_sublist_{idx}.csv")
        df.to_csv(file_path, index=False)
        print(f"Saved {file_path}")
    
    print("All sublists processed and saved.")

In [113]:
#started feb 18 10:28PM 
process_and_save_sublists(sublists, karina, 5)

Processing sublist 1/10...
Processing word 1/100: khazaria
Completed khazaria in 12.24 seconds. Estimated time remaining: 1211.52 seconds.
Processing word 2/100: christkike
Completed christkike in 11.78 seconds. Estimated time remaining: 1176.84 seconds.
Processing word 3/100: soiboi


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
#YOU'RE IN THE DEBUGGING SECTION NOW -----
covid = filter_and_extract_word_stats(karina, "covid") # Get a dataframe of the input word, and its first and last appearances
maskcovid = mask_df(covid, 'covid') # Mask the input word
the_df = get_random_samples(maskcovid, 5)
first_appearance_dict_list = [accumulate_meaning_unrecognized(trial_df, 'first_appearance', "covid") for trial_df in the_df] # Get the probability distributions
n = "whatever trial number we're on"
word = "some word"
covid = filter_and_extract_word_stats(karina, "covid") # Get a dataframe of the input word, and its first and last appearances
maskcovid = mask_df(covid, 'covid') # Mask the input word
the_df = get_random_samples(maskcovid, 5)
first_appearance_dict_list = accumulate_meaning_unrecognized(the_df, 'first_appearance', "covid") # Get the probability distributions
last_appearance_dict_list = accumulate_meaning_unrecognized(the_df, 'last_appearance', "some word")
first = compute_metrics(first_appearance_dict_list) # Get the metrics for each trial 
last = compute_metrics(last_appearance_dict_list)
first = aggregate_metrics(first)
last = aggregate_metrics(last)
#add this to a dataframe where each row is the input word (assume we're passing in a list of words)
#each column in the dataframe is each first/last appearance's respective metrics, and respective mean/STD socres 

test_list = ['covid', 'russia', 'china']

def profile_code():
    words = ["covid", "russia", "china"]  # Test with a few words first
    dataframe = karina  # Your input DataFrame
    n = 5  # Number of trials per word

    profiler = cProfile.Profile()
    profiler.enable()
    process_word_trials(words, dataframe, n)  # Replace with your function call
    profiler.disable()

    s = io.StringIO()
    sortby = 'cumulative'  # Sort by total time spent in each function
    ps = pstats.Stats(profiler, stream=s).sort_stats(sortby)
    ps.print_stats(20)  # Show top 20 slowest functions
    print(s.getvalue())

# Run the profiling
profile_code()

Processing word 1/3: covid
Completed covid in 13.72 seconds. Estimated time remaining: 27.44 seconds.
Processing word 2/3: russia
Completed russia in 17.07 seconds. Estimated time remaining: 15.39 seconds.
Processing word 3/3: china
Completed china in 15.45 seconds. Estimated time remaining: 0.00 seconds.
         51607975 function calls (50628379 primitive calls) in 46.233 seconds

   Ordered by: cumulative time
   List reduced from 1055 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.005    0.005   46.236   46.236 /var/folders/87/9qwg67f164gck_jk0qq9fkc40000gn/T/ipykernel_37158/2236755408.py:1(process_word_trials)
       30    0.013    0.000   19.351    0.645 /var/folders/87/9qwg67f164gck_jk0qq9fkc40000gn/T/ipykernel_37158/3099757713.py:1(accumulate_meaning_unrecognized)
       30    0.000    0.000   19.223    0.641 /Users/easy/.pyenv/versions/3.11.5/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py: