In [112]:
import pandas as pd
import numpy as np
from umap import UMAP
import plotly.express as px
import os
import pickle
import datetime

from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import openai
import tiktoken
from openai.embeddings_utils import get_embedding

api_key = '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'
openai.api_key = api_key

You'll follow the steps of the data science process that we've discussed:

You will first **define** the problem you want to solve and investigate potential solutions.

Next, you will **analyze** the problem through visualizations and data exploration to have a better understanding of what algorithms and features are appropriate for solving it.

You will then **implement** the algorithms and metrics of your choice, documenting the preprocessing, refinement, and post-processing steps along the way.

Afterwards, you will collect **results** about your findings, visualize significant quantities, validate/justify your results, and make any concluding remarks about whether your implementation adequately solves the problem.

Finally, you will **construct** a blog post (or other medium for a write-up) to document all of the steps from start to finish of your project, or deploy your results into a web application.

In [132]:
class MediaBiasModel:

    def __init__(self, label='mediabiasmodel'):
        '''Initializes a model and loads the media bias dataset.'''
        self.label='mediabiasmodel'

        # set the embedding model parameters
        self.embedding_model = "text-embedding-ada-002"
        embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
        self.max_tokens = 8000
        self.encoding = tiktoken.get_encoding(embedding_encoding)


    def load_data(self):
        '''
        Loads the dataset required for training the model and remaps the topics and media bias to simplified labels.
        '''

        media_bias_map = {
            'HuffPost': 'left',
            'Federalist': 'right',
            'Daily Beast': 'left',
            'Alternet': 'left',
            'Breitbart': 'right',
            'New Yorker': 'left',
            'American Greatness': 'right', # from https://mediabiasfactcheck.com/american-greatness/
            'Daily Caller': 'right',
            'Daily Wire': 'right',
            'Slate': 'left',
            'Reuters': 'center',
            'Hill': 'center', # from https://mediabiasfactcheck.com/the-hill/
            'USA Today': 'left',
            'CNBC': 'left',
            'Yahoo News - Latest News & Headlines': 'left',
            'AP': 'left',
            'Bloomberg': 'left',
            'Fox News': 'right',
            'MSNBC': 'left',
            'Daily Stormer': 'right', # from https://mediabiasfactcheck.com/the-hill/
            'New York Times': 'left'
            }

        new_topics_map = {
        '#metoo':'activism',
        'abortion':'abortion',
        'black lives matter':'activism',
        'blm':'activism',
        'coronavirus':'coronavirus-and-vaccines',
        'elections-2020':'politics',
        'environment':'environment',
        'gender':'socioeconomics',
        'gun control':'gun-control',
        'gun-control':'gun-control',
        'immigration':'immigration',
        'international-politics-and-world-news':'politics',
        'islam':'islam',
        'marriage-equality':'activism',
        'middle-class':'socioeconomics',
        'sport':'sport',
        'student-debt':'socioeconomics',
        'taxes':'socioeconomics',
        'trump-presidency':'politics',
        'universal health care':'universal-health-care',
        'vaccine':'coronavirus-and-vaccines',
        'vaccines':'coronavirus-and-vaccines',
        'white-nationalism':'white-nationalism',
        }

        df = pd.read_csv('../data/sentences_embeddings.csv')
        if 'embedding' in df.columns:
            df["embedding"] = df.embedding.apply(eval).apply(np.array)  # convert string to array
        
        df = df.rename(columns={'topic':'topic_original'})
        df['topic'] = df['topic_original'].map(new_topics_map)
        df['outlet_bias'] = df['outlet'].map(media_bias_map)
        self.df = df

    def transform_data(self, input_df=None):
        '''Transforms the sentence data with OpenAI ADA embeddings.'''
        # read the API KEY from an environment variable and set it in openai
        if input_df is None:
            if 'embedding' in self.df.columns:
                return self.df
            else:
                # omit reviews that are too long to embed
                self.df["n_tokens"] = self.df.text.apply(lambda x: len(self.encoding.encode(x)))
                self.df = self.df[self.df.n_tokens <= self.max_tokens]

                # get embedding
                self.df["embedding"] = self.df.text.apply(lambda x: get_embedding(x, engine=self.embedding_model))
                return self.df
        else:
            input_df["n_tokens"] = input_df.text.apply(lambda x: len(self.encoding.encode(x)))
            input_df = input_df[input_df.n_tokens <= self.max_tokens]

            # get embedding
            input_df["embedding"] = input_df.text.apply(lambda x: get_embedding(x, engine=self.embedding_model))
            return input_df

    
    def get_text_embedding(self, text):
        '''Gets the embedding of a single sentence.'''

        n_tokens = len(self.encoding.encode(text))
        print(n_tokens)
        if n_tokens > self.max_tokens:
            raise ValueError(f'The input text is too long and resulting in {n_tokens} tokens. The maximum number of tokens for this model is 8000.')
        
        return get_embedding(text, engine=self.embedding_model)

    def fit_models(self):
        '''
        Fits the classifiers for topic, bias and outlet bias on the training data.
        '''
        self.topic_model = KNeighborsClassifier(n_neighbors=15)
        self.bias_model = LogisticRegression(C=0.0002, penalty='l2')
        self.politics_model = MLPClassifier(hidden_layer_sizes=(128, 128), max_iter=300)

        print('Fitting topic classifier...')
        self.topic_model.fit(np.vstack(self.df.embedding.values), self.df.topic)
        print('Fitting bias classifier...')
        self.bias_model.fit(np.vstack(self.df.embedding.values), self.df.label_bias)
        print('Fitting political bias classifier...')
        self.politics_model.fit(np.vstack(self.df.embedding.values), self.df.outlet_bias)
        
        print('Model fitting complete. To save the current classifiers, run model.save().')

    def load_models(self, model_tag):
        '''
        Loads pre-trained topic, bias and political bias models. 
        Requires a user-specified model tag to correctly identify models.
        '''
        self.model_tag = model_tag
        self.topic_model = pickle.load(open(f'model_topic_{model_tag}.pickle', 'rb'))
        self.bias_model = pickle.load(open(f'model_bias_{model_tag}.pickle', 'rb'))
        self.politics_model = pickle.load(open(f'model_politics_{model_tag}.pickle', 'rb'))
        

    def save_models(self, model_tag=None):
        '''
        Saves the model to a pickle file and makes it available for reuse without re-training.
        '''
        if model_tag is None:
            # get a datetime tag to assign to model
            model_tag = datetime.datetime.now().strftime("%m%d%Y_%H%M%S")

        self.model_tag = model_tag
        pickle.dump(self.topic_model, open(f'model_topic_{model_tag}.pickle', 'wb'))
        pickle.dump(self.bias_model, open(f'model_bias_{model_tag}.pickle', 'wb'))
        pickle.dump(self.politics_model, open(f'model_politics_{model_tag}.pickle', 'wb'))

    def predict_labels_text(self, sentence):
        '''
        Predicts the topic, bias and outlet bias of a single sentence.
        '''
        embedding = np.array(self.get_text_embedding(sentence))
        topic = self.topic_model.predict(embedding.reshape(1, -1))
        bias = self.bias_model.predict(embedding.reshape(1, -1))
        politics = self.politics_model.predict(embedding.reshape(1, -1))

        return topic, bias, politics

    def predict_labels_df(self, input_df):
        '''
        Transforms text into embeddings and predicts the topic, bias and outlet bias of the 'text' column of an input dataframe.
        '''
        input_df = self.transform_data(input_df=input_df)
        input_df['topic'] = self.topic_model.predict(np.vstack(input_df.embedding.values))
        input_df['label_bias'] = self.bias_model.predict(np.vstack(input_df.embedding.values))
        input_df['outlet_bias'] = self.politics_model.predict(np.vstack(input_df.embedding.values))

        return input_df

    def analyze_full_article(self, article_text):
        '''
        Splits the input article text into sentences and analyzes each one separately for bias.
        '''
        # transform the article into sentence data and get labels
        sentences = article_text.replace('\n','').replace('U.S.','United States').split('.')
        df_sentences = pd.DataFrame(sentences, columns=['text'])
        df_sentences = self.predict_labels_df(df_sentences)

        return df_sentences




In [35]:
model = MediaBiasModel()

In [36]:
model.transform_data()

Unnamed: 0.1,Unnamed: 0,text,news_link,outlet,topic_original,type,label_bias,label_opinion,biased_words,n_tokens,embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z,topic,outlet_bias
0,0,"""Orange Is the New Black"" star Yael Stone is r...",https://www.foxnews.com/entertainment/australi...,Fox News,environment,right,Non-biased,Entirely factual,[],33,"[-0.0019234662177041173, -0.04733271151781082,...",2.172204,3.680499,3.263189,4.458643,3.111911,environment,right
1,1,"""We have one beautiful law,"" Trump recently sa...",https://www.alternet.org/2020/06/law-and-order...,Alternet,gun control,left,Biased,Somewhat factual but also opinionated,"['bizarre', 'characteristically']",25,"[-0.02060660719871521, 0.01556632574647665, 0....",7.181209,6.020027,7.151180,6.310200,2.527700,gun-control,left
2,2,"...immigrants as criminals and eugenics, all o...",https://www.nbcnews.com/news/latino/after-step...,MSNBC,white-nationalism,left,Biased,Expresses writer’s opinion,"['criminals', 'fringe', 'extreme']",31,"[-0.010726450942456722, -0.029740046709775925,...",8.432751,5.749578,8.363820,6.083026,3.141858,white-nationalism,left
3,3,...we sounded the alarm in the early months of...,https://www.alternet.org/2019/07/fox-news-has-...,Alternet,white-nationalism,left,Biased,Somewhat factual but also opinionated,[],59,"[0.007879192940890789, -0.02152813784778118, 0...",8.021709,5.698002,7.919033,5.909703,3.103915,white-nationalism,left
4,4,[Black Lives Matter] is essentially a non-fals...,http://feedproxy.google.com/~r/breitbart/~3/-v...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,['cult'],28,"[-0.04406517744064331, -0.013899666257202625, ...",9.504386,6.213847,9.402060,6.063433,2.734318,activism,right
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3669,3669,You’ve heard of Jim Crow and Southern Segregat...,http://feedproxy.google.com/~r/breitbart/~3/ei...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,['ALL'],17,"[-0.014510887674987316, -0.024596648290753365,...",7.070063,6.401396,7.602796,6.397684,3.698412,activism,right
3670,3670,Young female athletes’ dreams and accomplishme...,http://feedproxy.google.com/~r/breitbart/~3/eW...,Breitbart,marriage-equality,,Biased,Expresses writer’s opinion,"['dashed', '""identify""']",25,"[-0.03523683175444603, -0.02459663897752762, 0...",10.647132,3.889740,9.332088,3.518990,2.775394,activism,right
3671,3671,"Young white men, reacting to social and educat...",https://thefederalist.com/2016/05/23/how-anti-...,Federalist,white-nationalism,right,Biased,Expresses writer’s opinion,"['evil', 'white']",28,"[-0.011496055871248245, -0.04264130815863609, ...",8.605273,5.729459,8.479422,6.053718,3.204368,white-nationalism,right
3672,3672,Young women taking part in high school and col...,,Breitbart,sport,right,Biased,Somewhat factual but also opinionated,"['dashed', '""identify""']",38,"[-0.0370485894382, -0.031217575073242188, 0.02...",10.673921,3.946843,9.370227,3.514213,2.689805,sport,right


In [37]:
model.fit_models()

Fitting topic classifier...
Fitting bias classifier...
Fitting political bias classifier...
Model fitting complete. To save the current classifiers, run model.save().


In [38]:
model.save_models(model_tag='test')

In [133]:
model2 = MediaBiasModel()

In [135]:
model2.load_models(model_tag='test')

In [49]:
text_data = model.df[['text', 'topic', 'label_bias', 'label_opinion']]

In [50]:
text_data

Unnamed: 0,text,topic,label_bias,label_opinion
0,"""Orange Is the New Black"" star Yael Stone is r...",environment,Non-biased,Entirely factual
1,"""We have one beautiful law,"" Trump recently sa...",gun-control,Biased,Somewhat factual but also opinionated
2,"...immigrants as criminals and eugenics, all o...",white-nationalism,Biased,Expresses writer’s opinion
3,...we sounded the alarm in the early months of...,white-nationalism,Biased,Somewhat factual but also opinionated
4,[Black Lives Matter] is essentially a non-fals...,activism,Biased,Expresses writer’s opinion
...,...,...,...,...
3669,You’ve heard of Jim Crow and Southern Segregat...,activism,Biased,Expresses writer’s opinion
3670,Young female athletes’ dreams and accomplishme...,activism,Biased,Expresses writer’s opinion
3671,"Young white men, reacting to social and educat...",white-nationalism,Biased,Expresses writer’s opinion
3672,Young women taking part in high school and col...,sport,Biased,Somewhat factual but also opinionated


# Make ChatGPT database for testing

In [51]:
text_data['text'][0]

'"Orange Is the New Black" star Yael Stone is renouncing her U.S. green card to return to her native Australia in order to fight climate change.'

In [53]:
gpt_0 = "Renowned actress Yael Stone, best known for her role in the hit series 'Orange Is the New Black', has made a bold decision to renounce her U.S. green card and return to her native Australia. The motivation behind her move is none other than the pressing issue of climate change."

In [64]:
gpt_0

"Renowned actress Yael Stone, best known for her role in the hit series 'Orange Is the New Black', has made a bold decision to renounce her U.S. green card and return to her native Australia. The motivation behind her move is none other than the pressing issue of climate change."

In [79]:
gpt_0_embedding = model2.get_text_embedding(gpt_0)

60


In [80]:
model2.predict_labels_text(gpt_0)

60


(array(['environment'], dtype=object),
 array(['Non-biased'], dtype=object),
 array(['center'], dtype='<U6'))

In [81]:
gpt_0_rw = "Stone's move has sparked a vibrant conversation within conservative circles, with many applauding her for standing up for individual rights and expressing dissent against prevailing narratives. By choosing to return to Australia, she affirms her connection to her homeland and the belief that local communities should have the power to address environmental issues in ways that suit their specific needs."

In [82]:
model2.predict_labels_text(gpt_0_rw)

67


(array(['environment'], dtype=object),
 array(['Non-biased'], dtype=object),
 array(['center'], dtype='<U6'))

In [83]:
biased_words = pd.read_excel('../data/bias_word_lexicon.xlsx', header=None)

In [93]:
full_article = '''
Talented actress Yael Stone, known for her role in the popular series "Orange Is the New Black," has taken a bold stance in favor of personal freedom and individual responsibility. In a surprising move, Stone has renounced her U.S. green card and decided to return to her native Australia.

Stone's decision is rooted in her belief that individuals should have the autonomy to make choices that align with their values and priorities. By relinquishing her U.S. green card, she symbolizes a commitment to personal sovereignty and a rejection of perceived government overreach.

While climate change is a widely discussed topic, Stone's decision should be viewed as a reflection of her personal convictions rather than a blanket endorsement of climate change concerns. In an era marked by political polarization, her choice to prioritize personal freedom over perceived environmental crises resonates with like-minded individuals who value individual agency and limited government interference.

Stone's move has sparked a vibrant conversation within conservative circles, with many applauding her for standing up for individual rights and expressing dissent against prevailing narratives. By choosing to return to Australia, she affirms her connection to her homeland and the belief that local communities should have the power to address environmental issues in ways that suit their specific needs.

As Yael Stone continues her career, her decision to renounce her U.S. green card will undoubtedly inspire others to question the balance between personal freedom and societal expectations. While some may interpret her choice as an act of defiance, it serves as a reminder that personal convictions and individual choices should be celebrated, even if they diverge from prevailing narratives.

In a world where personal liberties and diverse viewpoints are valued, Yael Stone's decision to renounce her U.S. green card showcases the importance of individual agency and the power of personal convictions. Her story serves as a catalyst for robust discussions about the role of government and the importance of personal freedom in shaping our own destinies.

'''

In [96]:
import re
article_stripped = re.sub(r'[^a-zA-Z0-9\s]', '', full_article)
# Convert to lowercase
article_lower = article_stripped.lower()

print(article_lower)


talented actress yael stone known for her role in the popular series orange is the new black has taken a bold stance in favor of personal freedom and individual responsibility in a surprising move stone has renounced her us green card and decided to return to her native australia

stones decision is rooted in her belief that individuals should have the autonomy to make choices that align with their values and priorities by relinquishing her us green card she symbolizes a commitment to personal sovereignty and a rejection of perceived government overreach

while climate change is a widely discussed topic stones decision should be viewed as a reflection of her personal convictions rather than a blanket endorsement of climate change concerns in an era marked by political polarization her choice to prioritize personal freedom over perceived environmental crises resonates with likeminded individuals who value individual agency and limited government interference

stones move has sparked a 

In [97]:
biased_word_counter = 0 
biased_words_in_article = []
for word in article_lower.split(' '):
    if word in biased_words:
        biased_word_counter += 1
        biased_words_in_article.append(word)

In [98]:
biased_word_counter

0

In [99]:
biased_words_in_article

[]

In [101]:
sentences = full_article.split('.')

In [102]:
cc = "As conservatives, we recognize the importance of responsible environmental stewardship, but we reject the left's alarmist agenda that undermines economic prosperity and individual freedoms. It is crucial to engage in open dialogue, foster scientific debate, and consider a range of perspectives. By focusing on innovation, market-driven solutions, and equitable international agreements, we can strike a balance between environmental protection and the well-being of our citizens."

In [103]:
model2.predict_labels_text(cc)

80


(array(['environment'], dtype=object),
 array(['Non-biased'], dtype=object),
 array(['right'], dtype='<U6'))

In [104]:
cc_l = "The urgency of the climate crisis demands bold, transformative policies that challenge the status quo. We must resist the influence of fossil fuel lobbyists and push for the rapid phasing out of coal, oil, and gas. By investing in green infrastructure, sustainable agriculture, and nature-based solutions, we can create a resilient future that protects both people and the planet."

In [105]:
model2.predict_labels_text(cc_l)

70


(array(['environment'], dtype=object),
 array(['Non-biased'], dtype=object),
 array(['left'], dtype='<U6'))

In [119]:
sentences = full_article.replace('\n','').replace('U.S.','United States').split('.')
df_sentences = pd.DataFrame(sentences, columns=['text'])

In [136]:
model2 = MediaBiasModel()

In [137]:
model2.load_models(model_tag='test')

In [139]:
df_sentences = model2.predict_labels_df(df_sentences[df_sentences['text_length']>1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df["n_tokens"] = input_df.text.apply(lambda x: len(self.encoding.encode(x)))


In [143]:
df_sentences['label_bias'].mode()

0    Non-biased
Name: label_bias, dtype: object

In [144]:
df_sentences['outlet_bias'].mode()

0    left
Name: outlet_bias, dtype: object

In [145]:
df_sentences['topic'].mode()

0    environment
Name: topic, dtype: object

In [121]:
df_sentences[df_sentences['text']]

Unnamed: 0,text,n_tokens
0,"Talented actress Yael Stone, known for her rol...",36
1,"In a surprising move, Stone has renounced her...",22
2,Stone's decision is rooted in her belief that ...,24
3,By relinquishing her United States green card...,25
4,While climate change is a widely discussed top...,31
5,"In an era marked by political polarization, h...",32
6,Stone's move has sparked a vibrant conversatio...,29
7,"By choosing to return to Australia, she affir...",36
8,"As Yael Stone continues her career, her decisi...",32
9,While some may interpret her choice as an act...,35


In [126]:
df_sentences['text_length'] = df_sentences['text'].apply(lambda x: len(x))