In [10]:
import os
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

#adapted from part of the Week 3 LSA topic modeling code.
#this function walks a folder, opens every '.txt' file it finds and appends each file's content to the 'document_texts' list.
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'red' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/red'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

#define the get_polarity function to calculate the sentiment polarity score,as Ref: https://towardsdatascience.com/my-absolute-go-to-for-sentiment-analysis-textblob-3ac3a11d524
def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#convert the TF-IDF vectors into a pandas DataFrame for further analysis.
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#red_target_words
target_words = [
    "person", "expression", "rome", "emotional", "social", "theory", "roman", "war", "anger", "emotion",
    "goal", "psychologist", "research", "chariot", "performance", "ferrari", "psychology", "race", "athlete", "sport",
    "area", "system", "classic", "power", "high", "period", "chinese", "maya", "laser", "china",
    "vermilion", "national", "surface", "association", "player", "planet", "scarlet", "earth", "football", "mar",
    "body", "star", "thermodynamic", "culture", "dwarf", "temperature", "system", "transfer", "energy", "heat"
]

#the polarity of a document does not depend on the target word, so compute it once per document instead of once per (word, document) pair.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf, which avoids rebuilding and scanning a list for every word.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #use the TF-IDF column itself rather than a case-sensitive substring test on the raw text:
    #documents that do not contain the token have weight 0 and contribute nothing.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'person': -0.00
Polarity sentiment score for 'expression': -0.01
Polarity sentiment score for 'rome': 0.01
Polarity sentiment score for 'emotional': 0.01
Polarity sentiment score for 'social': 0.02
Polarity sentiment score for 'theory': 0.02
Polarity sentiment score for 'roman': 0.00
Polarity sentiment score for 'war': 0.04
Polarity sentiment score for 'anger': -0.11
Polarity sentiment score for 'emotion': 0.02
Polarity sentiment score for 'goal': 0.01
Polarity sentiment score for 'psychologist': 0.01
Polarity sentiment score for 'research': 0.02
Polarity sentiment score for 'chariot': 0.03
Polarity sentiment score for 'performance': 0.01
Polarity sentiment score for 'ferrari': 0.00
Polarity sentiment score for 'psychology': 0.04
Polarity sentiment score for 'race': 0.02
Polarity sentiment score for 'athlete': 0.01
Polarity sentiment score for 'sport': 0.04
Polarity sentiment score for 'area': 0.01
Polarity sentiment score for 'system': 0.02
Polarity sentim

Calculate the polarity sentiment scores for the keywords of the nine other colors using the same program.

In [14]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'orange' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/orange'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#orange_target_words
target_words = [
    'show', 'theory', 'cup', 'pigment', 'produce', 'study', 'different', 'pumpkin', 'netherland', 'carrot', 'write', 'hallow', 'know', 'period', 'kingdom', 'rgb', 'ancient', 'egypt', 'halloween', 'egyptian', 'rock', 'area', 'autumn', 'arche', 'record', 'data', 'park', 'aircraft', 'flight', 'recorder', 'subcontinent', 'period', 'kingdom', 'dynasty', 'rule', 'british', 'spice', 'empire', 'indian', 'india', 'fly', 'flown', 'united', 'design', 'signal', 'pole', 'pigment', 'flagpole', 'national', 'orpiment'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'show': 0.01
Polarity sentiment score for 'theory': 0.02
Polarity sentiment score for 'cup': 0.02
Polarity sentiment score for 'pigment': 0.01
Polarity sentiment score for 'produce': 0.02
Polarity sentiment score for 'study': 0.02
Polarity sentiment score for 'different': 0.03
Polarity sentiment score for 'pumpkin': 0.03
Polarity sentiment score for 'netherland': 0.03
Polarity sentiment score for 'carrot': 0.03
Polarity sentiment score for 'write': 0.00
Polarity sentiment score for 'hallow': 0.00
Polarity sentiment score for 'know': 0.00
Polarity sentiment score for 'period': 0.01
Polarity sentiment score for 'kingdom': 0.01
Polarity sentiment score for 'rgb': 0.00
Polarity sentiment score for 'ancient': 0.02
Polarity sentiment score for 'egypt': 0.00
Polarity sentiment score for 'halloween': 0.02
Polarity sentiment score for 'egyptian': 0.00
Polarity sentiment score for 'rock': 0.01
Polarity sentiment score for 'area': 0.01
Polarity sentiment score for 'au

In [15]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'yellow' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/yellow'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#yellow_target_words
target_words = [
    'object', 'earth', 'yolk', 'sun', 'sunlight', 'egg', 'wave', 'solar', 'theory', 'radiation', 'three', 'united', 'know', 'primarie', 'rgb', 'plant', 'corn', 'pigment', 'ochre', 'maize', 'specie', 'hotel', 'average', 'water', 'nevada', 'casino', 'downtown', 'area', 'city', 'vega', 'high', 'country', 'period', 'empire', 'pleasure', 'egyptian', 'chinese', 'egypt', 'china', 'gold', 'point', 'taxicab', 'classification', 'jersey', 'france', 'race', 'stage', 'rider', 'taxi', 'tour'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'object': 0.01
Polarity sentiment score for 'earth': 0.01
Polarity sentiment score for 'yolk': 0.04
Polarity sentiment score for 'sun': 0.02
Polarity sentiment score for 'sunlight': 0.02
Polarity sentiment score for 'egg': 0.04
Polarity sentiment score for 'wave': 0.02
Polarity sentiment score for 'solar': 0.03
Polarity sentiment score for 'theory': 0.04
Polarity sentiment score for 'radiation': 0.05
Polarity sentiment score for 'three': 0.02
Polarity sentiment score for 'united': 0.00
Polarity sentiment score for 'know': 0.00
Polarity sentiment score for 'primarie': 0.00
Polarity sentiment score for 'rgb': 0.00
Polarity sentiment score for 'plant': 0.01
Polarity sentiment score for 'corn': 0.01
Polarity sentiment score for 'pigment': 0.02
Polarity sentiment score for 'ochre': 0.05
Polarity sentiment score for 'maize': 0.04
Polarity sentiment score for 'specie': 0.08
Polarity sentiment score for 'hotel': 0.01
Polarity sentiment score for 'average': 0.01
Pol

In [16]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'green' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/green'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#green_target_words
target_words = [
    'meaning', 'emotion', 'painting', 'goal', 'modern', 'historian', 'object', 'artist', 'hope', 'envy', 'animal', 'nature', 'water', 'natural', 'human', 'organism', 'cell', 'earth', 'emerald', 'life', 'disease', 'babylonian', 'technique', 'mesopotamia', 'waza', 'youth', 'age', 'casino', 'judo', 'health', 'modern', 'word', 'three', 'asia', 'pigment', 'system', 'rgb', 'europe', 'empire', 'theory', 'different', 'carbon', 'photosynthesi', 'reaction', 'chlorophyll', 'signal', 'traffic', 'plant', 'emotion'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'meaning': 0.02
Polarity sentiment score for 'emotion': 0.04
Polarity sentiment score for 'painting': 0.02
Polarity sentiment score for 'goal': 0.01
Polarity sentiment score for 'modern': 0.03
Polarity sentiment score for 'historian': 0.01
Polarity sentiment score for 'object': 0.01
Polarity sentiment score for 'artist': 0.01
Polarity sentiment score for 'hope': 0.08
Polarity sentiment score for 'envy': 0.05
Polarity sentiment score for 'animal': 0.00
Polarity sentiment score for 'nature': 0.01
Polarity sentiment score for 'water': 0.02
Polarity sentiment score for 'natural': 0.01
Polarity sentiment score for 'human': 0.01
Polarity sentiment score for 'organism': 0.00
Polarity sentiment score for 'cell': 0.02
Polarity sentiment score for 'earth': 0.02
Polarity sentiment score for 'emerald': 0.02
Polarity sentiment score for 'life': 0.04
Polarity sentiment score for 'disease': 0.01
Polarity sentiment score for 'babylonian': 0.00
Polarity sentiment score for 

In [17]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'cyan' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/cyan'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#cyan_target_words
target_words = [
    'show', 'atmosphere', 'cloud', 'product', 'film', 'cinecolor', 'ring', 'different', 'planet', 'uranu', 'high', 'oil', 'methane', 'fuel', 'ancient', 'food', 'stove', 'cooking', 'greek', 'natural', 'cyanide', 'pb', 'ferrocyanide', 'ion', 'cn', 'teal', 'iron', 'fe', 'pigment', 'prussian', 'surface', 'air', 'pressure', 'high', 'lapi', 'cyanotype', 'temperature', 'atmosphere', 'earth', 'water', 'organism', 'human', 'visible', 'produce', 'spectrum', 'specie', 'flame', 'bacterial', 'cell', 'bacteria'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'show': 0.01
Polarity sentiment score for 'atmosphere': 0.03
Polarity sentiment score for 'cloud': 0.00
Polarity sentiment score for 'product': 0.01
Polarity sentiment score for 'film': 0.02
Polarity sentiment score for 'cinecolor': 0.00
Polarity sentiment score for 'ring': 0.01
Polarity sentiment score for 'different': 0.02
Polarity sentiment score for 'planet': 0.01
Polarity sentiment score for 'uranu': 0.01
Polarity sentiment score for 'high': 0.02
Polarity sentiment score for 'oil': 0.01
Polarity sentiment score for 'methane': 0.01
Polarity sentiment score for 'fuel': 0.02
Polarity sentiment score for 'ancient': 0.03
Polarity sentiment score for 'food': 0.03
Polarity sentiment score for 'stove': 0.02
Polarity sentiment score for 'cooking': 0.05
Polarity sentiment score for 'greek': 0.00
Polarity sentiment score for 'natural': 0.02
Polarity sentiment score for 'cyanide': 0.01
Polarity sentiment score for 'pb': 0.00
Polarity sentiment score for 'ferrocyan

In [18]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'blue' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/blue'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#blue_target_words
target_words = [
    'kingdom', 'western', 'period', 'middle', 'age', 'empire', 'roman', 'europe', 'santorini', 'island', 'device', 'rgb', 'ocean', 'pigment', 'ultramarine', 'high', 'algae', 'violet', 'water', 'sea', 'effect', 'rayleigh', 'sadnes', 'ware', 'wavelength', 'chinese', 'particle', 'porcelain', 'sky', 'scatter', 'uniform', 'high', 'shot', 'national', 'country', 'argentine', 'ball', 'basketball', 'player', 'argentina', 'pigment', 'preference', 'flagpole', 'product', 'sun', 'crystal', 'different', 'sky', 'primarie', 'halo'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'kingdom': 0.00
Polarity sentiment score for 'western': 0.01
Polarity sentiment score for 'period': 0.01
Polarity sentiment score for 'middle': 0.01
Polarity sentiment score for 'age': 0.01
Polarity sentiment score for 'empire': 0.01
Polarity sentiment score for 'roman': 0.01
Polarity sentiment score for 'europe': 0.00
Polarity sentiment score for 'santorini': 0.00
Polarity sentiment score for 'island': 0.02
Polarity sentiment score for 'device': 0.01
Polarity sentiment score for 'rgb': 0.00
Polarity sentiment score for 'ocean': 0.01
Polarity sentiment score for 'pigment': 0.02
Polarity sentiment score for 'ultramarine': 0.03
Polarity sentiment score for 'high': 0.02
Polarity sentiment score for 'algae': 0.05
Polarity sentiment score for 'violet': 0.04
Polarity sentiment score for 'water': 0.02
Polarity sentiment score for 'sea': 0.02
Polarity sentiment score for 'effect': 0.03
Polarity sentiment score for 'rayleigh': 0.00
Polarity sentiment score for 'sadn

In [19]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'purple' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/purple'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#purple_target_words
target_words = [
    'thread', 'pigment', 'cocoon', 'produce', 'manganese', 'fiber', 'production', 'silkworm', 'violet', 'silk', 'hematite', 'greek', 'woman', 'roman', 'dye', 'constantinople', 'emperor', 'violet', 'empire', 'byzantine', 'win', 'week', 'blackberry', 'playoff', 'blackberrie', 'record', 'nfl', 'baltimore', 'season', 'raven', 'award', 'franchise', 'polari', 'star', 'club', 'nba', 'fremantle', 'player', 'league', 'season', 'dynasty', 'high', 'artist', 'million', 'national', 'country', 'impressionist', 'dye', 'chinese', 'china'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'thread': 0.00
Polarity sentiment score for 'pigment': 0.03
Polarity sentiment score for 'cocoon': 0.00
Polarity sentiment score for 'produce': 0.01
Polarity sentiment score for 'manganese': 0.07
Polarity sentiment score for 'fiber': 0.01
Polarity sentiment score for 'production': 0.02
Polarity sentiment score for 'silkworm': 0.01
Polarity sentiment score for 'violet': 0.14
Polarity sentiment score for 'silk': 0.06
Polarity sentiment score for 'hematite': 0.02
Polarity sentiment score for 'greek': 0.00
Polarity sentiment score for 'woman': 0.01
Polarity sentiment score for 'roman': 0.00
Polarity sentiment score for 'dye': 0.07
Polarity sentiment score for 'constantinople': 0.00
Polarity sentiment score for 'emperor': 0.01
Polarity sentiment score for 'violet': 0.14
Polarity sentiment score for 'empire': 0.02
Polarity sentiment score for 'byzantine': 0.02
Polarity sentiment score for 'win': 0.01
Polarity sentiment score for 'week': 0.00
Polarity sentiment sc

In [20]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'black' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/black'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#black_target_words
target_words = [
    'painting', 'propertie', 'high', 'thermal', 'object', 'artist', 'cnt', 'infrared', 'carbon', 'nanotube', 'night', 'source', 'spectrum', 'lamp', 'mourn', 'ultraviolet', 'visible', 'wavelength', 'radiation', 'uv', 'begin', 'practice', 'public', 'religion', 'term', 'empire', 'friday', 'enlightenment', 'roman', 'magic', 'belief', 'pigment', 'witch', 'player', 'union', 'ball', 'grey', 'witchcraft', 'death', 'rugby', 'term', 'western', 'empire', 'dark', 'roman', 'europe', 'middle', 'period', 'evil', 'age'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'painting': 0.01
Polarity sentiment score for 'propertie': 0.01
Polarity sentiment score for 'high': 0.02
Polarity sentiment score for 'thermal': 0.02
Polarity sentiment score for 'object': 0.01
Polarity sentiment score for 'artist': 0.01
Polarity sentiment score for 'cnt': 0.00
Polarity sentiment score for 'infrared': 0.07
Polarity sentiment score for 'carbon': 0.04
Polarity sentiment score for 'nanotube': 0.02
Polarity sentiment score for 'night': 0.04
Polarity sentiment score for 'source': 0.01
Polarity sentiment score for 'spectrum': 0.03
Polarity sentiment score for 'lamp': 0.02
Polarity sentiment score for 'mourn': -0.00
Polarity sentiment score for 'ultraviolet': 0.05
Polarity sentiment score for 'visible': 0.05
Polarity sentiment score for 'wavelength': 0.03
Polarity sentiment score for 'radiation': 0.08
Polarity sentiment score for 'uv': 0.00
Polarity sentiment score for 'begin': 0.00
Polarity sentiment score for 'practice': 0.01
Polarity sentiment

In [21]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'white' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/white'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#white_target_words
target_words = [
    'order', 'doric', 'ionic', 'different', 'nao', 'roman', 'building', 'greek', 'column', 'temple', 'animal', 'specie', 'population', 'culture', 'chalk', 'water', 'sea', 'whale', 'beluga', 'pearl', 'radiation', 'ice', 'visible', 'ray', 'cumulu', 'wavelength', 'spectrum', 'water', 'cloud', 'snow', 'built', 'city', 'modernist', 'house', 'architect', 'design', 'style', 'penguin', 'architecture', 'building', 'belief', 'human', 'fat', 'protein', 'soul', 'dead', 'cow', 'spirit', 'ghost', 'milk'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'order': 0.01
Polarity sentiment score for 'doric': 0.00
Polarity sentiment score for 'ionic': 0.01
Polarity sentiment score for 'different': 0.02
Polarity sentiment score for 'nao': 0.03
Polarity sentiment score for 'roman': 0.00
Polarity sentiment score for 'building': 0.03
Polarity sentiment score for 'greek': 0.00
Polarity sentiment score for 'column': 0.01
Polarity sentiment score for 'temple': 0.04
Polarity sentiment score for 'animal': 0.00
Polarity sentiment score for 'specie': 0.01
Polarity sentiment score for 'population': 0.01
Polarity sentiment score for 'culture': 0.01
Polarity sentiment score for 'chalk': 0.03
Polarity sentiment score for 'water': 0.02
Polarity sentiment score for 'sea': 0.02
Polarity sentiment score for 'whale': 0.01
Polarity sentiment score for 'beluga': 0.02
Polarity sentiment score for 'pearl': 0.05
Polarity sentiment score for 'radiation': 0.03
Polarity sentiment score for 'ice': 0.01
Polarity sentiment score for 'visible

In [22]:
def load_text_documents(folder_path):
    """Read every ``.txt`` file under ``folder_path`` (recursively) as UTF-8.

    Directories and files are visited in sorted name order, so the returned
    list -- and the row order of anything built from it -- is the same on
    every run (``os.walk`` on its own gives no ordering guarantee).

    Args:
        folder_path: Root directory to search.

    Returns:
        A ``(document_texts, document_labels)`` tuple. ``document_texts`` holds
        one string per file. ``document_labels`` is always an empty list; it is
        kept only so existing two-value unpacking keeps working.
    """
    document_texts = []
    document_labels = []  # never populated; returned for interface compatibility

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    document_texts.append(f.read())

    return document_texts, document_labels

#load the 'pink' corpus. NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this folder exists.
folder_path = '/Users/yinshuodi/Desktop/mini-project-23006440/Webcrawling/code&data/data/my-data/pink'
#document_labels is unpacked but not used anywhere else in this cell.
document_texts,document_labels = load_text_documents(folder_path)

def get_polarity(content):
    """Return the TextBlob sentiment polarity of the text in ``content``."""
    blob = TextBlob(content)
    sentiment = blob.sentiment
    return sentiment.polarity

#calculate the TF-IDF values for each word in the document set document_texts.
vectorizer = TfidfVectorizer()
tf_idf = vectorizer.fit_transform(document_texts)
vocab  = vectorizer.get_feature_names_out()

#dense DataFrame view of the TF-IDF matrix (one row per document, one column per term).
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab)

#pink_target_words
target_words = [
    'lead', 'woman', 'cosmetic', 'pigment', 'lip', 'associate', 'boy', 'girl', 'magenta', 'lipstick', 'manganese', 'food', 'stand', 'specie', 'chick', 'nest', 'tongue', 'rhodochrosite', 'bird', 'flamingo', 'presl', 'hamilton', 'specie', 'blossom', 'angiosperm', 'william', 'flower', 'nelson', 'plant', 'emma', 'artist', 'meaning', 'different', 'show', 'object', 'product', 'study', 'inter', 'club', 'miami', 'serie', 'child', 'social', 'club', 'sex', 'palermo', 'doll', 'barbie', 'toy', 'gender'
]

#the polarity of a document does not depend on the target word, so compute it once per document.
document_polarities = [get_polarity(content) for content in document_texts]

#vocabulary_ maps each term to its column index in tf_idf.
word_to_column = vectorizer.vocabulary_

#score for a word = sum over documents of (document polarity * the word's TF-IDF weight in that document).
for word in target_words:
    word_index = word_to_column.get(word)
    if word_index is None:
        #without this guard a missing word would reuse the previous word's index and running total.
        print(f"'{word}' is not in the TF-IDF vocabulary; skipped")
        continue

    #documents that do not contain the token have weight 0 and contribute nothing,
    #so no case-sensitive substring test on the raw text is needed.
    word_column = tf_idf[:, word_index].toarray().ravel()
    sentiment_score = sum(
        polarity * weight for polarity, weight in zip(document_polarities, word_column)
    )

    print(f"Polarity sentiment score for '{word}': {sentiment_score:.2f}")


Polarity sentiment score for 'lead': 0.01
Polarity sentiment score for 'woman': 0.00
Polarity sentiment score for 'cosmetic': 0.01
Polarity sentiment score for 'pigment': 0.01
Polarity sentiment score for 'lip': 0.01
Polarity sentiment score for 'associate': 0.00
Polarity sentiment score for 'boy': 0.00
Polarity sentiment score for 'girl': 0.00
Polarity sentiment score for 'magenta': 0.09
Polarity sentiment score for 'lipstick': 0.05
Polarity sentiment score for 'manganese': 0.03
Polarity sentiment score for 'food': 0.01
Polarity sentiment score for 'stand': 0.00
Polarity sentiment score for 'specie': 0.01
Polarity sentiment score for 'chick': 0.00
Polarity sentiment score for 'nest': 0.01
Polarity sentiment score for 'tongue': 0.01
Polarity sentiment score for 'rhodochrosite': 0.05
Polarity sentiment score for 'bird': 0.00
Polarity sentiment score for 'flamingo': 0.03
Polarity sentiment score for 'presl': 0.00
Polarity sentiment score for 'hamilton': 0.00
Polarity sentiment score for 