In [48]:
import re, requests, urllib.parse
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Location of the stopword list used by cleaning().
_STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"

# Filled on the first cleaning() call and reused afterwards, so applying
# cleaning() to a whole DataFrame column triggers one download, not one per row.
_stopwords_cache = None


def _load_stopwords():
    """Return the remote stopword list as a set, downloading it only once."""
    global _stopwords_cache
    if _stopwords_cache is None:
        response = requests.get(_STOPWORDS_URL, timeout=30)
        # Fail loudly rather than treating an HTTP error page as stopwords.
        response.raise_for_status()
        _stopwords_cache = set(response.content.decode().splitlines())
    return _stopwords_cache


def cleaning(data):
    """Normalize a piece of text for downstream analysis.

    Steps, in order: replace every character that is not an ASCII letter with
    a space, lowercase, tokenize with NLTK, drop stopwords, lemmatize each
    remaining token with WordNet, and join the tokens with single spaces.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, space-separated tokens.

    Raises
    ------
    requests.RequestException
        If the stopword list cannot be downloaded on first use.
    """
    # Remove punctuation (and digits / any other non-letter character)
    data = re.sub("[^a-zA-Z]", " ", data)

    # Lowercasing
    data = data.lower()

    # Tokenizing
    words = word_tokenize(data)

    # Remove stopwords
    stopwords = _load_stopwords()
    words = [word for word in words if word not in stopwords]

    # Lemmatizing
    lem = WordNetLemmatizer()
    words = [lem.lemmatize(word) for word in words]

    # Back to a String
    return " ".join(words)

import pandas as pd

# Prototype on the first five articles only. nrows stops the parser after five
# rows instead of reading the entire file and then discarding most of it.
df = pd.read_csv("news.csv", nrows=5)

# Clean title and body separately, then combine them into one text field.
df["Cleaned_Title"] = df["Title"].apply(cleaning)
df["Cleaned_Content"] = df["Content"].apply(cleaning)
df["Text"] = df["Cleaned_Title"] + " " + df["Cleaned_Content"]
df.head(5)

Unnamed: 0,Source,URL,Title,Date,Content,Cleaned_Title,Cleaned_Content,Text
0,theguardian,https://www.theguardian.com/commentisfree/2023...,"As the SNP loses its iron grip on Scotland, La...",9/4/2023,What has been seen cannot be unseen. Some imag...,snp loses iron grip scotland labour seize gold...,unseen image potent indelibly etched nation re...,snp loses iron grip scotland labour seize gold...
1,asahi,https://www.asahi.com/ajw/articles/14880931,‘Abenomask’ giveaway details finally disclosed...,8/4/2023,"Forced into a corner by a court, the governmen...",abenomask giveaway detail finally disclosed co...,forced corner court government grudgingly rele...,abenomask giveaway detail finally disclosed co...
2,rthk,https://news.rthk.hk/rthk/en/component/k2/1695...,Families flock to cemeteries to mark Ching Ming,5/4/2023,Large numbers of people flocked to Hong Kong's...,family flock cemetery mark ching ming,large number people flocked hong kong cemetery...,family flock cemetery mark ching ming large nu...
3,japantimes,https://www.japantimes.co.jp/news/2023/04/03/b...,Japan firms roll back COVID measures for new r...,3/4/2023,A number of companies throughout Japan decided...,japan firm roll covid measure recruit event,number company japan decided dispense covid he...,japan firm roll covid measure recruit event nu...
4,foxnews,https://www.foxnews.com/opinion/i-treated-2000...,"I treated 20,000 COVID patients and 3 years af...",2/4/2023,Fox News medical contributor Dr. Janette Neshe...,treated covid patient year lockdown,fox news medical contributor janette nesheiwat...,treated covid patient year lockdown fox news m...


In [49]:
import pandas as pd
import nltk

# Fetch the NLTK data packages this cell relies on.
for resource in (
    "punkt",
    "averaged_perceptron_tagger",
    "maxent_ne_chunker",
    "words",
):
    nltk.download(resource)

# Function to extract GPE entities using NLTK NER
def ner(data):
    """Return the GPE entities NLTK finds in ``data``.

    The text is tokenized, part-of-speech tagged and passed through
    ``nltk.ne_chunk``; every chunk labelled ``"GPE"`` is returned as a single
    string with its words joined by spaces. Entities appear in text order and
    duplicates are kept.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(data))

    # Plain (token, tag) tuples have no .label(); only chunk subtrees do.
    return [
        " ".join(token for token, _pos in subtree.leaves())
        for subtree in nltk.ne_chunk(tagged_tokens)
        if hasattr(subtree, "label") and subtree.label() == "GPE"
    ]

# Tag the raw (uncleaned) article body of every row.
df["Regions"] = df["Content"].apply(ner)

# Preview up to five rows. Iterating over .head(5) works for any index and any
# row count, whereas df.loc[i] for i in range(5) raises KeyError when the frame
# has fewer than five rows or a non-default index.
for regions in df["Regions"].head(5):
    print(regions)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\h1enr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\h1enr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\h1enr\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\h1enr\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


['Glasgow', 'Scotland', 'Unforgotten', 'Edinburgh', 'Scotland', 'Scotland', 'Scotland', 'Scotland', 'Scotland', 'Labour', 'England', 'Scotland', 'Labour', 'Rutherglen', 'Scotland']
['Japan', 'Abenomasks', 'Audit']
['Large', 'Hong Kong', 'Diamond Hill', 'Covid', 'Hong Kong']
['Japan', 'Japan']
['News', 'Narcan', 'Spanish', 'U.S.', 'Washington', 'United States', 'Normal', 'Americans', 'America', 'Americans', 'New York', 'New York', 'New York']


In [53]:
import en_core_web_sm

# spaCy pipeline, loaded on the first ner() call and reused afterwards so that
# applying ner() to a DataFrame column does not reload the model for every row.
_spacy_nlp = None


def ner(data):
    """Return the GPE entities spaCy's ``en_core_web_sm`` model finds in ``data``.

    Parameters
    ----------
    data : str
        Text to analyse.

    Returns
    -------
    list of str
        The text of every entity labelled ``"GPE"``, in document order
        (duplicates are kept).
    """
    global _spacy_nlp
    if _spacy_nlp is None:
        # Load the model once
        _spacy_nlp = en_core_web_sm.load()

    # Process the text with the model
    doc = _spacy_nlp(data)

    # Return a list of all GPE entities found in the text
    return [ent.text for ent in doc.ents if ent.label_ == "GPE"]

# Tag the raw article body of every row; ner is passed directly because
# wrapping it in a lambda adds nothing.
df["Regions"] = df["Content"].apply(ner)

# Preview up to five rows. Iterating over .head(5) works for any index and any
# row count, whereas df.loc[i] for i in range(5) raises KeyError when the frame
# has fewer than five rows or a non-default index.
for regions in df["Regions"].head(5):
    print(regions)

['Glasgow', 'Scotland', 'UK', 'Scotland', 'Westminster', 'Edinburgh', 'Scotland', 'Scotland', 'Yousaf', 'Scotland', 'Scotland', 'Yousaf', 'Scotland', 'Westminster', 'Scotland', 'Westminster', 'England', 'Scotland', 'Scotland']
['Kamiwaki', 'Japan']
["Hong Kong's", 'Diamond Hill', 'Hong Kong']
['Japan', 'Japan']
['U.S.', 'Washington, DC', 'the United States', 'IV', 'America', 'Philadelphia', 'New York', 'New York', 'New York']


In [54]:
import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger

def ner(df, column="Content"):
    """Extract LOC entities from a text column with flair.

    Loads the ``flair/ner-english-fast`` tagger once, runs it over every value
    of ``df[column]`` and prints each row's result as it is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts to tag.
    column : str, default "Content"
        Name of the column whose text is tagged.

    Returns
    -------
    list of list of str
        One list per row, in row order, containing the text of every span
        tagged ``"LOC"``.
    """
    # load tagger
    tagger = SequenceTagger.load("flair/ner-english-fast")

    # collects one list of LOC entities per row
    locs_all = []

    # Iterate over the column directly; iterrows() would build a full Series
    # for every row just to read this one field.
    for text in df[column]:
        # create a new sentence object
        sentence = Sentence(text)

        # predict NER tags
        tagger.predict(sentence)

        # extract LOC entities
        locs = [span.text for span in sentence.get_spans("ner") if span.tag == "LOC"]

        print(locs)

        # add the LOC entities to the list
        locs_all.append(locs)

    return locs_all

# Run the flair tagger over the frame and store one list of locations per row.
flair_regions = ner(df)
df["Regions"] = flair_regions



2023-04-11 00:34:08,787 loading file C:\Users\h1enr\.flair\models\ner-english-fast\4c58e7191ff952c030b82db25b3694b58800b0e722ff15427f527e1631ed6142.e13c7c4664ffe2bbfa8f1f5375bd0dced866b8c1dd7ff89a6d705518abf0a611
2023-04-11 00:34:10,363 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
['Glasgow', 'Scotland', 'UK', 'Westminster', 'Edinburgh', 'Scotland', 'Holyrood', 'Scotland', 'Westminster', 'Scotland', 'Bute House', 'Scotland', 'Westminster', 'Blackpool South', 'Scotland', 'Westminster', 'England', 'Scotland', 'Rutherglen', 'Hamilton West', 'Scotland']
['Japan']
['Hong Kong', 'Junk Bay', 'Wo Hop Shek', 'Diamond Hill', 'Hong Kong']
['Japan', 'Japan']
['U.S.', 'White House', 'Washington', 'DC', 'United States', 'America', 'Philadelphia', 'New York', 'New York', 'New York']


In [4]:
import pandas as pd
# Reload the processed dataset, discard rows with any missing value,
# then print a column-by-column summary.
df = pd.read_csv("analysis_ready.csv").dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3157 entries, 0 to 3156
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Source           3157 non-null   object 
 1   URL              3157 non-null   object 
 2   Title            3157 non-null   object 
 3   Date             3157 non-null   object 
 4   Content          3157 non-null   object 
 5   Cleaned_Title    3157 non-null   object 
 6   Cleaned_Content  3157 non-null   object 
 7   Text             3157 non-null   object 
 8   Regions          3157 non-null   object 
 9   Mode             3157 non-null   object 
 10  sadness          3157 non-null   float64
 11  joy              3157 non-null   float64
 12  love             3157 non-null   float64
 13  anger            3157 non-null   float64
 14  fear             3157 non-null   float64
 15  surprise         3157 non-null   float64
 16  Sentiment        3157 non-null   object 
 17  Confidence    

In [5]:
# index=False keeps the row index out of the file; otherwise the next
# read_csv would pick it up as a spurious "Unnamed: 0" column.
df.to_csv("analysis_ready.csv", index=False)