In [1]:
import pandas as pd
from collections import Counter
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Ensure the necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/c.ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/c.ryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the CSV file
file_path = 'data/female-winners.csv'
data = pd.read_csv(file_path)

In [4]:
data.head()

Unnamed: 0,year,gender,name,href,category,motivation
0,2009,female,Ada E. Yonath,https://api.nobelprize.org/2/laureate/843,Chemistry,for studies of the structure and function of t...
1,2013,female,Alice Munro,https://api.nobelprize.org/2/laureate/892,Literature,master of the contemporary short story
2,1982,female,Alva Myrdal,https://api.nobelprize.org/2/laureate/543,Peace,for their work for disarmament and nuclear and...
3,2020,female,Andrea Ghez,https://api.nobelprize.org/2/laureate/990,Physics,for the discovery of a supermassive compact ob...
4,2023,female,Anne L’Huillier,https://api.nobelprize.org/2/laureate/1028,Physics,for experimental methods that generate attosec...


In [5]:
# Function to clean and tokenize text
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [6]:
# Apply preprocessing to the motivation column
data['cleaned_motivation'] = data['motivation'].apply(preprocess_text)

In [7]:
data.head()

Unnamed: 0,year,gender,name,href,category,motivation,cleaned_motivation
0,2009,female,Ada E. Yonath,https://api.nobelprize.org/2/laureate/843,Chemistry,for studies of the structure and function of t...,"[studies, structure, function, ribosome]"
1,2013,female,Alice Munro,https://api.nobelprize.org/2/laureate/892,Literature,master of the contemporary short story,"[master, contemporary, short, story]"
2,1982,female,Alva Myrdal,https://api.nobelprize.org/2/laureate/543,Peace,for their work for disarmament and nuclear and...,"[work, disarmament, nuclear, weaponfree, zones]"
3,2020,female,Andrea Ghez,https://api.nobelprize.org/2/laureate/990,Physics,for the discovery of a supermassive compact ob...,"[discovery, supermassive, compact, object, cen..."
4,2023,female,Anne L’Huillier,https://api.nobelprize.org/2/laureate/1028,Physics,for experimental methods that generate attosec...,"[experimental, methods, generate, attosecond, ..."


In [8]:
data[:1]['cleaned_motivation'][0][0]

'studies'

In [9]:
# Combine all tokens into a single list
all_tokens = [token for sublist in data['cleaned_motivation'] for token in sublist]

In [10]:
# Get the frequency distribution of the words
word_freq = Counter(all_tokens)

In [11]:
word_freq.most_common(10)

[('rights', 8),
 ('discoveries', 8),
 ('work', 7),
 ('discovery', 7),
 ('struggle', 7),
 ('development', 7),
 ('human', 6),
 ('efforts', 5),
 ('women', 5),
 ('peace', 5)]