## List the 50 most similar word neighbors in each year for each concept

In [17]:
import json
import csv
from gensim.models import Word2Vec

# Collect, for every yearly word2vec model, the nearest neighbors of each
# target concept and store them in a CSV with one row per year and one column
# per concept.
model_folder = '/Users/kawaiyuen/nlpworkshop/concept-creep-chi_raw/models/'
concepts_file = '/Users/kawaiyuen/nlpworkshop/concept-creep-chi/0_data/wordlist/concepts.json'
output_csv = '/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/by-year.csv'

# Number of most-similar words retrieved per concept and year
top_n = 50

# Decode the JSON explicitly as UTF-8 rather than relying on the platform's
# default text encoding
with open(concepts_file, 'r', encoding='utf-8') as file:
    concepts = json.load(file)

# Only the first entry of each concept's value is used as the query word
query_words = [value[0] for value in concepts.values()]

# Maps str(year) -> list of neighbor lists, in the same order as `concepts`
results = {}

for year in range(1979, 2024):

    # Load the word2vec model trained for this year
    model = Word2Vec.load(model_folder + f'pd_{year}.model')
    vectors = model.wv

    year_results = []
    for concept in query_words:
        if concept in vectors.key_to_index:
            # Keep only the words; the similarity scores are discarded
            neighbors = vectors.most_similar(concept, topn=top_n)
            year_results.append([word for word, _ in neighbors])
        else:
            # Concept is not in this year's vocabulary: keep an empty
            # placeholder so the columns stay aligned with the header
            year_results.append([])

    results[str(year)] = year_results
    print(f'{year} done')

with open(output_csv, 'w', newline='') as file:
    writer = csv.writer(file)

    # Header: 'Year' followed by the concept keys
    writer.writerow(['Year'] + list(concepts.keys()))

    # Each neighbor list is written by csv.writer as its str() form, i.e. a
    # Python list literal inside a single CSV cell
    for year, year_results in results.items():
        writer.writerow([year] + year_results)

1979 done
1980 done
1981 done
1982 done
1983 done
1984 done
1985 done
1986 done
1987 done
1988 done
1989 done
1990 done
1991 done
1992 done
1993 done
1994 done
1995 done
1996 done
1997 done
1998 done
1999 done
2000 done
2001 done
2002 done
2003 done
2004 done
2005 done
2006 done
2007 done
2008 done
2009 done
2010 done
2011 done
2012 done
2013 done
2014 done
2015 done
2016 done
2017 done
2018 done
2019 done
2020 done
2021 done
2022 done
2023 done


## List the most similar word neighbors in each five-year timespan for each concept

### Concatenate lists in each timespan

In [18]:
import ast
import csv

# Group the per-year neighbor lists into five-year timespans: for each concept
# and timespan, all neighbor words of the years in that span are joined into
# one space-separated string.

# Define the time ranges
time_ranges = ['1979-1983', '1984-1988', '1989-1993', '1994-1998', '1999-2003', '2004-2008', '2009-2013', '2014-2018', '2019-2023']

# Parse the "start-end" labels once instead of once per CSV row
range_bounds = []
for time_range in time_ranges:
    start_text, end_text = time_range.split('-')
    range_bounds.append((int(start_text), int(end_text)))

# concept -> one list of text fragments per time range (joined after reading)
neighbor_parts = {}

# Read the CSV file
with open('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/by-year.csv', 'r') as file:
    reader = csv.DictReader(file)

    for row in reader:
        year = int(row['Year'])

        for i, (start_year, end_year) in enumerate(range_bounds):
            if not start_year <= year <= end_year:
                continue

            for concept, cell in row.items():
                if concept == 'Year':
                    continue

                # Each cell holds the text of a Python list literal; parse it
                # with literal_eval, which only accepts literals and never
                # executes code (unlike eval)
                neighbors = ast.literal_eval(cell)

                if concept not in neighbor_parts:
                    neighbor_parts[concept] = [[] for _ in time_ranges]
                # The trailing space separates this year's words from the next
                neighbor_parts[concept][i].append(' '.join(neighbors) + ' ')

# concept -> one concatenated string per time range
concatenated_neighbors = {
    concept: [''.join(parts) for parts in per_range]
    for concept, per_range in neighbor_parts.items()
}

# Write the results to a new CSV file
with open('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/by-five-year.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Header: 'timespan' followed by the concept names
    header_row = ['timespan'] + list(concatenated_neighbors.keys())
    writer.writerow(header_row)

    # One row per time range
    for i, time_range in enumerate(time_ranges):
        data_row = [time_range] + [per_range[i] for per_range in concatenated_neighbors.values()]
        writer.writerow(data_row)

### Filter by at least 3 occurrences in each timespan

In [19]:
from collections import Counter

import pandas as pd

# Keep, for every concept and timespan, only the words that occur at least
# three times in that timespan's concatenated neighbor string.
#
# NOTE: this cell overwrites the file it reads. Running it a second time
# without regenerating by-five-year.csv would process the already-filtered
# list text instead of the space-separated words.

# Read the CSV file
data = pd.read_csv('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/by-five-year.csv')


def _frequent_words(cell, min_count=3):
    """Return the words in *cell* occurring at least *min_count* times.

    Words are returned in order of first appearance. A missing cell (read by
    pandas as NaN) yields an empty list instead of raising on ``.split()``.
    """
    if pd.isna(cell):
        return []
    word_counts = Counter(cell.split())
    return [word for word, count in word_counts.items() if count >= min_count]


# The first column holds the timespan label; every other column is a concept
for column in data.columns[1:]:
    data[column] = data[column].map(_frequent_words)

# Save the updated dataframe; each list is written as its string form
data.to_csv('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/by-five-year.csv', index=False)

### Concatenate lists in timespans to one giant list

In [24]:
import ast
import csv

# Pool the filtered neighbor lists of all timespans into one list per concept
# and write them as a single data row.

# concept -> all neighbor words across timespans (duplicates are kept)
concatenated_neighbors = {}

# Read the CSV file
with open('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/by-five-year.csv', 'r') as file:
    reader = csv.DictReader(file)

    for row in reader:

        # Skip the first column (the timespan label)
        for concept, cell in list(row.items())[1:]:
            # Each cell holds the text of a Python literal; literal_eval
            # parses it without executing arbitrary code (unlike eval)
            neighbors = ast.literal_eval(cell)

            pooled = concatenated_neighbors.setdefault(concept, [])

            if isinstance(neighbors, list):
                pooled.extend(neighbors)
            else:
                # A non-list literal is kept as a single entry
                pooled.append(neighbors)

# Write the results to a new CSV file
with open('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/one-list.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    # Header row: the concept names
    writer.writerow(concatenated_neighbors.keys())

    # Single data row: each pooled list is written as its string form
    writer.writerow(concatenated_neighbors.values())

## Construct final word neighbors lists

In [30]:
import pandas as pd
import ast

# Load the pooled neighbor lists; every cell is the text of a Python list
data = pd.read_csv('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/one-list.csv')


def _recurring_words(cell):
    """Return the words of the list literal in *cell* that occur at least twice.

    Words come back in order of first appearance, with surrounding whitespace
    and quote characters stripped.
    """
    tally = {}
    for token in ast.literal_eval(cell):
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1

    kept = []
    for token, seen in tally.items():
        # Retain only words that were pooled at least two times
        if seen >= 2:
            kept.append(token.strip().strip('\'"'))
    return kept


# Replace every cell, column by column, with its filtered word list
for column in data.columns:
    data[column] = data[column].map(_recurring_words)

# Write the result to a new CSV file (lists are stored as their string form)
data.to_csv('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/final-list.csv', index=False)

### CSV to JSON

In [35]:
import pandas as pd
import json
import ast

# Load the final neighbor lists (one column per target concept)
data = pd.read_csv('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/2_pipeline/preprocessed/quali/final-list.csv')

# Column names are the target concepts
headers = list(data.columns)

# concept -> flat list of neighbor words
json_data = {}

for header in headers:
    merged = []
    for cell in data[header].tolist():
        # Skip missing cells
        if pd.isnull(cell):
            continue
        # Each cell is the text of a Python list; append its items
        merged += ast.literal_eval(cell)
    json_data[header] = merged

# Serialize without escaping non-ASCII characters
json_output = json.dumps(json_data, ensure_ascii=False, indent=4)

with open('/Users/kawaiyuen/nlpworkshop/concept-creep-chi/0_data/wordlist/concepts_neighbors.json', 'w', encoding='utf-8') as file:
    file.write(json_output)