# CNN and Fox data analysis

## CNN

### Yearly （2005-2022）

In [1]:
#2005-2022, yearly
#sentiment analysis
#statistics and data visualization
#results interpretation

In [2]:
from textblob import TextBlob
import pandas as pd
from pyecharts.charts import Pie,Bar,Geo,Map,Boxplot,Page,WordCloud,Grid,Line,Funnel,Timeline
import pyecharts.options as opts
from pyecharts.globals import ThemeType
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ChartType
import pyLDAvis.sklearn
import pyLDAvis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the CNN articles, discard every row with a missing field, and reduce
# the publication date to its calendar year.
df = (
    pd.read_csv("./data/news_cnn.csv")
    .dropna()
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
)

In [4]:
# Number of articles that remain after the incomplete rows were dropped.
print(df.shape[0])

19991


In [5]:
# Sentiment analysis for headlines: collect the first component of TextBlob's
# sentiment (the polarity score) of every headline, grouped by publication year.
# Result: headline = {"<year>": [score, score, ...], ...} in ascending year order.
headline = {}
for year in df['date'].value_counts().sort_index().index.tolist():
    temp = []
    for each in df[df['date'] == year]['headline'].values.tolist():
        try:
            temp.append(TextBlob(each).sentiment[0])
        except Exception:
            # Report the headline that could not be scored and leave it out.
            # Exception (rather than a bare except) keeps a kernel interrupt
            # able to stop this long-running loop.
            print(each)

    headline[str(year)] = temp

In [6]:
# Box plot of the yearly headline scores, written to a standalone HTML page.
# The first year (2010) is left out: it holds only 2 articles, which makes
# the plot fail, so the chart starts from 2011.
plot_years = list(headline)[1:]
plot_scores = [headline[label] for label in plot_years]

category_axis = opts.AxisOpts(
    type_="category",
    boundary_gap=True,
    splitarea_opts=opts.SplitAreaOpts(is_show=False),
    axislabel_opts=opts.LabelOpts(formatter="{value}"),
    splitline_opts=opts.SplitLineOpts(is_show=False),
)
value_axis = opts.AxisOpts(
    type_="value",
    name="",
    splitarea_opts=opts.SplitAreaOpts(
        is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
    ),
)

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=plot_years)
box_plot.add_yaxis(series_name="headline", y_axis=box_plot.prepare_data(plot_scores))
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="headline Emotional distribution"),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=category_axis,
    yaxis_opts=value_axis,
)
# Tooltip placeholders: {a} series name, {b} data name, {c} numeric array.
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_html/year_emotion/headline.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/year_emotion/headline.html'

In [7]:
# Containers for the article-body scores, keyed by year.
body_1 = dict()  # Holistic analysis: one score for the whole body
body_2 = dict()  # Mean-taking analysis: mean of the per-sentence scores

In [8]:
# Holistic analysis: score every article body as a whole (first component of
# TextBlob's sentiment, i.e. polarity) and group the scores by year in body_1.
for year in df['date'].value_counts().sort_index().index.tolist():
    temp = []
    for each in df[df['date'] == year]['body'].values.tolist():
        try:
            temp.append(TextBlob(each).sentiment[0])
        except Exception:
            # Report the body that could not be scored and leave it out.
            # Exception (rather than a bare except) keeps a kernel interrupt
            # able to stop this long-running loop.
            print(each)

    body_1[str(year)] = temp

In [9]:
# Box plot of the yearly whole-body scores (body_1), written to an HTML page.
# As in the headline plot, the first year in the dictionary is skipped.
plot_years = list(body_1)[1:]
plot_scores = [body_1[label] for label in plot_years]

category_axis = opts.AxisOpts(
    type_="category",
    boundary_gap=True,
    splitarea_opts=opts.SplitAreaOpts(is_show=False),
    axislabel_opts=opts.LabelOpts(formatter="{value}"),
    splitline_opts=opts.SplitLineOpts(is_show=False),
)
value_axis = opts.AxisOpts(
    type_="value",
    name="",
    splitarea_opts=opts.SplitAreaOpts(
        is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
    ),
)

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=plot_years)
box_plot.add_yaxis(series_name="body", y_axis=box_plot.prepare_data(plot_scores))
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Holistic analysis distribution"),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=category_axis,
    yaxis_opts=value_axis,
)
# Tooltip placeholders: {a} series name, {b} data name, {c} numeric array.
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_html/year_emotion/body_1.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/year_emotion/body_1.html'

In [10]:
# Mean-taking analysis: split every article body on '.', score each fragment
# separately, and keep the mean fragment score (rounded to 2 decimals) per
# article. Scores are grouped by year in body_2.
# NOTE(review): blank fragments (e.g. after a trailing '.') are scored too and
# presumably come out neutral, pulling the mean towards 0 - confirm this is intended.
for year in df['date'].value_counts().sort_index().index.tolist():
    temp = []
    for each in df[df['date'] == year]['body'].values.tolist():
        num = []
        for e in each.split('.'):
            try:
                num.append(TextBlob(e).sentiment[0])
            except Exception:
                # Exception (rather than a bare except) keeps a kernel
                # interrupt able to stop this long-running loop.
                print(each)
        # Guard the division: if no fragment could be scored there is no mean.
        if num:
            temp.append(round(sum(num) / len(num), 2))

    body_2[str(year)] = temp



In [11]:
# Box plot of the yearly mean-of-sentences scores (body_2), written to an HTML page.
# As in the headline plot, the first year in the dictionary is skipped.
plot_years = list(body_2)[1:]
plot_scores = [body_2[label] for label in plot_years]

category_axis = opts.AxisOpts(
    type_="category",
    boundary_gap=True,
    splitarea_opts=opts.SplitAreaOpts(is_show=False),
    axislabel_opts=opts.LabelOpts(formatter="{value}"),
    splitline_opts=opts.SplitLineOpts(is_show=False),
)
value_axis = opts.AxisOpts(
    type_="value",
    name="",
    splitarea_opts=opts.SplitAreaOpts(
        is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
    ),
)

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=plot_years)
box_plot.add_yaxis(series_name="body", y_axis=box_plot.prepare_data(plot_scores))
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Mean-taking analysis"),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=category_axis,
    yaxis_opts=value_axis,
)
# Tooltip placeholders: {a} series name, {b} data name, {c} numeric array.
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_html/year_emotion/body_2.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/year_emotion/body_2.html'

In [12]:
#clustering
#statistics and data visualization
#results interpretation

In [13]:
# Topic modelling per year: remove stop words from every article body, build a
# TF-IDF matrix, fit a 5-topic LDA model on it, and save an interactive
# pyLDAvis page for that year.

# Read the stop-word list; the with-block closes the file handle again.
with open('data//stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [words.strip() for words in stopwords_file.readlines()]
stopwords.append(' ')
# Set copy for constant-time membership tests in the token filter below.
stopword_set = set(stopwords)

# The number of topics selected
n_topics = 5
# The number of top words per topic to output (not used in this cell)
n_top_words = 20

for year in df['date'].value_counts().sort_index().index.tolist():
    temp = df[df['date'] == year]['body'].values.tolist()

    # One space-joined string of the remaining tokens per article.
    wordslist = []
    for v in temp:
        seg_list = nltk.word_tokenize(v)
        wordslist.append(' '.join(seg for seg in seg_list if seg not in stopword_set))

    tf_idf_vectorizer = TfidfVectorizer()
    # Sparse TF-IDF matrix; it is passed on as-is, so no dense copy is made.
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core step: feed LDA the resulting TF-IDF matrix
    lda.fit(tf_idf)

    # Use pyLDAvis for visualization
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer, mds='mmds')
    try:
        pyLDAvis.save_html(data, './cnn_html/year_lda/lda_{}.html'.format(year))
    except Exception:
        # Best effort: report the year whose page could not be written and
        # carry on. Exception keeps a kernel interrupt working.
        print(year)

In [14]:
# Articles per keyword, most frequent first. The last (rarest) keyword is too
# scarce and is discarded in the keyword charts.
df['keyword'].value_counts(ascending=False)

Climate change             9676
Extreme weather            1845
Environmental crisis       1063
Carbon emissions            942
Climate action              932
Climate policy              883
Environmental policy        867
Renewable energy            846
Sustainable development     842
Paris Agreement             839
Climate science             441
Sea level rise              406
Greenhouse gases            228
Biodiversity loss            66
Climate advocacy             58
Climate adaptation           56
COP15                         1
Name: keyword, dtype: int64

In [15]:
# Count how many articles carry each keyword per year and write one line
# chart (time series) per keyword. The least frequent keyword is skipped.
key = df['keyword'].value_counts().index.tolist()[:-1]
count = 0
for k in key:
    # Articles per year for this keyword, in ascending year order.
    temp = df[df['keyword']==k].groupby('date')['keyword'].count().sort_index()

    time_axis = opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,  # gap between the axis name and the axis line
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,  # axis-name font size
        ),
    )
    quantity_axis = opts.AxisOpts(
        name='quantity',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    )

    line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
    line.add_xaxis(temp.index.astype('str').tolist())
    line.add_yaxis(k, temp.values.tolist(), is_smooth=True)
    # Half-transparent area under the curve, no value labels on the points.
    line.set_series_opts(
        areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
        label_opts=opts.LabelOpts(is_show=False),
    )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="quantity distribution"),
        xaxis_opts=time_axis,
        yaxis_opts=quantity_axis,
    )
    line.render(f"./cnn_html/year_key/{k}.html")


In [16]:
# Yearly article counts (2011-2022) of the five most frequent keywords, drawn
# as five series on one line chart.
key = df['keyword'].value_counts().index.tolist()[:5]
years = list(range(2011, 2023))

line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
# The x-axis is built from the same year window as the y-values below, so the
# categories always line up with the data. It must not be taken from a
# variable left over from another cell, whose years may differ.
line.add_xaxis([str(year) for year in years])

for ke in key:
    # One slot per year of the window; years without articles stay at 0.
    num = {year: 0 for year in years}
    temp = df[df['keyword']==ke].groupby('date')['keyword'].count().sort_index()
    for k, v in temp.to_dict().items():
        if k in num:
            num[k] = v

    line.add_yaxis(ke, list(num.values()), is_smooth=True)

# Series options only affect series that already exist on the chart, so they
# are applied after all five keywords have been added.
line.set_series_opts(
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False),
)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="quantity distribution"),
    xaxis_opts=opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,  # gap between the axis name and the axis line
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,  # axis-name font size
        ),
    ),
    yaxis_opts=opts.AxisOpts(
        name='quantity',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    ),
)

line.render("./cnn_html/year_key/5_key.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/year_key/5_key.html'

### Monthly （2019， 2021， 2022）

In [17]:
#2019-2022, monthly
#sentiment analysis
#statistics and data visualization
#results interpretation

In [18]:
# Reload the CNN articles for the monthly analysis: keep complete rows from
# 2019 onwards and turn the date into a "YYYY/M" label (month not zero-padded).
df = pd.read_csv("./data/news_cnn.csv").dropna()
# Parse the date column once and reuse the result.
published = pd.to_datetime(df['date'])
df = df.assign(year=published.dt.year)
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the full frame.
df = df[df['year'] >= 2019].copy()
published = published.loc[df.index]
#deal with time
df['date'] = published.dt.year.astype(str) + '/' + published.dt.month.astype(str)

In [19]:
# Month labels "YYYY/M" (month not zero-padded), January 2019 through
# December 2022, in chronological order.
y_time = [f"{year}/{month}" for year in range(2019, 2023) for month in range(1, 13)]

In [20]:
# Monthly headline sentiment: score every headline (first component of
# TextBlob's sentiment, i.e. polarity), group the scores by "YYYY/M" label,
# and draw a box plot of the months that have more than 5 scored headlines.
headline = {}
for y in y_time:
    temp = []
    for each in df[df['date'] == y]['headline'].values.tolist():
        try:
            temp.append(TextBlob(each).sentiment[0])
        except Exception:
            # Report the headline that could not be scored and leave it out.
            # Exception (rather than a bare except) keeps a kernel interrupt
            # able to stop this long-running loop.
            print(each)
    # Months with 5 or fewer scores are not plotted.
    if len(temp) > 5:
        headline[str(y)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(headline.keys()))
box_plot.add_yaxis(series_name="headline", y_axis=box_plot.prepare_data(list(headline.values())))
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="headline Emotional distribution"),
    # Zoom slider, since there can be up to 48 monthly boxes.
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
# Tooltip placeholders: {a} series name, {b} data name, {c} array of values.
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_html/month_emotion/headline.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/month_emotion/headline.html'

In [21]:
# Monthly whole-body sentiment: score every article body as a whole, group
# the scores by "YYYY/M" label, and draw a box plot of the months that have
# more than 5 scored articles.
body_1 = {} # Holistic analysis
body_2 = {} # Mean-taking analysis
for year in y_time:
    temp = []
    for each in df[df['date'] == year]['body'].values.tolist():
        try:
            temp.append(TextBlob(each).sentiment[0])
        except Exception:
            # Report the body that could not be scored and leave it out.
            # Exception (rather than a bare except) keeps a kernel interrupt
            # able to stop this long-running loop.
            print(each)
    # Months with 5 or fewer scores are not plotted.
    if len(temp) > 5:
        body_1[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_1.keys()))
box_plot.add_yaxis(series_name="body", y_axis=box_plot.prepare_data(list(body_1.values())))
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Holistic analysis distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
# Tooltip placeholders: {a} series name, {b} data name, {c} array of values.
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_html/month_emotion/body_1.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/month_emotion/body_1.html'

In [22]:
# Monthly mean-of-sentences sentiment: split every body on '.', score each
# fragment, keep the rounded mean per article, group by "YYYY/M" label in
# body_2, and draw a box plot of the months with more than 5 articles.
for year in y_time:
    temp = []
    for each in df[df['date'] == year]['body'].values.tolist():
        num = []
        for e in each.split('.'):
            try:
                num.append(TextBlob(e).sentiment[0])
            except Exception:
                # Exception (rather than a bare except) keeps a kernel
                # interrupt able to stop this long-running loop.
                print(each)
        # Guard the division: if no fragment could be scored there is no mean.
        if num:
            temp.append(round(sum(num) / len(num), 2))
    # Months with 5 or fewer scores are not plotted.
    if len(temp) > 5:
        body_2[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_2.keys()))
box_plot.add_yaxis(series_name="body", y_axis=box_plot.prepare_data(list(body_2.values())))
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Mean-taking analysis"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
# Tooltip placeholders: {a} series name, {b} data name, {c} array of values.
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_html/month_emotion/body_2.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/month_emotion/body_2.html'

In [23]:
#clustering
#statistics and data visualization
#results interpretation

In [24]:
# Topic modelling per month: remove stop words from every article body, build
# a TF-IDF matrix, fit a 5-topic LDA model on it, and save an interactive
# pyLDAvis page for that month.

# Read the stop-word list; the with-block closes the file handle again.
with open('data//stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [words.strip() for words in stopwords_file.readlines()]
stopwords.append(' ')
# Set copy for constant-time membership tests in the token filter below.
stopword_set = set(stopwords)

# The number of topics selected
n_topics = 5
# The number of top words per topic to output (not used in this cell)
n_top_words = 20

for year in y_time:
    temp = df[df['date'] == year]['body'].values.tolist()
    # y_time is a fixed calendar, so a month may have no articles at all;
    # the vectorizer cannot be fitted on an empty corpus, so skip it.
    if not temp:
        continue

    # One space-joined string of the remaining tokens per article.
    wordslist = []
    for v in temp:
        seg_list = nltk.word_tokenize(v)
        wordslist.append(' '.join(seg for seg in seg_list if seg not in stopword_set))

    tf_idf_vectorizer = TfidfVectorizer()
    # Sparse TF-IDF matrix; it is passed on as-is, so no dense copy is made.
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core step: feed LDA the resulting TF-IDF matrix
    lda.fit(tf_idf)

    # Use pyLDAvis for visualization
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer, mds='mmds')
    try:
        # '/' in the month label would be read as a directory separator.
        pyLDAvis.save_html(data, './cnn_html/month_lda/lda_{}.html'.format(year.replace('/','-')))
    except Exception:
        # Best effort: report the month whose page could not be written and
        # carry on. Exception keeps a kernel interrupt working.
        print(year)

In [25]:
# Count how many articles carry each keyword per month and write one line
# chart per keyword. The least frequent keyword is skipped.
key = df['keyword'].value_counts().index.tolist()[:-1]
count = 0
for k in key:
    # 'date' holds "YYYY/M" strings, so sorting the index would be
    # lexicographic ("2019/10" before "2019/2") and scramble the time axis.
    # Reindexing on y_time puts the months in calendar order, fills months
    # without articles with 0, and drops months outside the y_time window.
    temp = (
        df[df['keyword']==k].groupby('date')['keyword'].count()
        .reindex(y_time, fill_value=0)
    )
    line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
    line.add_xaxis(temp.index.astype('str').tolist())
    line.add_yaxis(k, temp.values.tolist(), is_smooth=True)
    # Half-transparent area under the curve, no value labels on the points.
    line.set_series_opts(
        areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
        label_opts=opts.LabelOpts(is_show=False),
    )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="quantity distribution"),
        datazoom_opts=opts.DataZoomOpts(),
        xaxis_opts=opts.AxisOpts(
            axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
            is_scale=False,
            boundary_gap=False,
            name='time',
            name_location='middle',
            name_gap=30,  # gap between the axis name and the axis line
            name_textstyle_opts=opts.TextStyleOpts(
                font_family='Times New Roman',
                font_size=16,  # axis-name font size
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            name='quantity',
            name_location='middle',
            name_gap=30,
            name_textstyle_opts=opts.TextStyleOpts(
                font_family='Times New Roman',
                font_size=16,
            ),
        ),
    )
    line.render(f"./cnn_html/month_key/{k}.html")

In [26]:
# Monthly article counts of the five most frequent keywords, drawn as five
# series on one line chart over the y_time calendar.
key = df['keyword'].value_counts().index.tolist()[:5]

line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
line.add_xaxis(y_time)

for ke in key:
    # One slot per month of y_time (the x-axis); months without articles stay at 0.
    num = {month: 0 for month in y_time}
    temp = df[df['keyword']==ke].groupby('date')['keyword'].count().sort_index()
    for k, v in temp.to_dict().items():
        if k in num:
            num[k] = v

    line.add_yaxis(ke, list(num.values()), is_smooth=True)

# Series options only affect series that already exist on the chart, so they
# are applied after all five keywords have been added.
line.set_series_opts(
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False),
)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="quantity distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    xaxis_opts=opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,  # gap between the axis name and the axis line
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,  # axis-name font size
        ),
    ),
    yaxis_opts=opts.AxisOpts(
        name='quantity',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    ),
)

line.render("./cnn_html/month_key/5_key.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_html/month_key/5_key.html'

## Fox

### Yearly （2005-2022）

In [27]:
#2005-2022, yearly
#sentiment analysis
#statistics and data visualization
#results interpretation

In [28]:
# Load the Fox articles, discard every row with a missing field, and reduce
# the publication date to its calendar year.
df = (
    pd.read_csv("./data/fox.csv")
    .dropna()
    .assign(date=lambda frame: pd.to_datetime(frame['date']).dt.year)
)

In [29]:
# Articles per year, most frequent year first.
df['date'].value_counts(ascending=False)

2015    2636
2022    1136
2019     951
2021     928
2020     770
2017     657
2016     624
2018     573
2014     486
2013     426
2012     376
2011     325
Name: date, dtype: int64

In [30]:
#headline sentiment analysis
headline = {}
for year in  df['date'].value_counts().sort_index().index.tolist():
    temp = []
    for each in df[df['date']==year]['headline'].values.tolist():
        try:
            temp.append(TextBlob(each).sentiment[0])
        except:
            print(each)

    headline[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))

box_plot = (
    # In 2010, there were only 2 pieces of data, cause an error, so start from 2011
    box_plot.add_xaxis(xaxis_data=list(headline.keys())[1:])
    .add_yaxis(series_name="headline", y_axis=box_plot.prepare_data(list(headline.values())[1:]))
    .set_global_opts(
        title_opts=opts.TitleOpts(
             title="headline Emotional distribution"
        ),
        tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            boundary_gap=True,
            splitarea_opts=opts.SplitAreaOpts(is_show=False),
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            name="",
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            ),
        ),
    )
    .set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))
)  

box_plot.render(r"./fox_html/year_emotion/headline.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/year_emotion/headline.html'

In [31]:
# Yearly body sentiment for Fox, in two variants, each drawn as a box plot:
#   body_1 - one score for the whole body (holistic analysis)
#   body_2 - mean of the scores of the '.'-separated fragments (mean-taking analysis)


def _yearly_body_box_plot(scores_by_year, title):
    """Build the box plot (series "body") for a {year label: [scores]} mapping.

    Only years with more than 5 scores are drawn. Years are not dropped by
    position, because this data set has no sparse leading year.
    """
    labels = [label for label, scores in scores_by_year.items() if len(scores) > 5]
    chart = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
    chart.add_xaxis(xaxis_data=labels)
    chart.add_yaxis(
        series_name="body",
        y_axis=chart.prepare_data([scores_by_year[label] for label in labels]),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title=title),
        tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
        xaxis_opts=opts.AxisOpts(
            type_="category",
            boundary_gap=True,
            splitarea_opts=opts.SplitAreaOpts(is_show=False),
            axislabel_opts=opts.LabelOpts(formatter="{value}"),
            splitline_opts=opts.SplitLineOpts(is_show=False),
        ),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            name="",
            splitarea_opts=opts.SplitAreaOpts(
                is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
            ),
        ),
    )
    # Tooltip placeholders: {a} series name, {c} array of values.
    chart.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))
    return chart


body_1 = {} # Holistic analysis
body_2 = {} # Mean-taking analysis
for year in df['date'].value_counts().sort_index().index.tolist():
    temp = []
    for each in df[df['date'] == year]['body'].values.tolist():
        try:
            temp.append(TextBlob(each).sentiment[0])
        except Exception:
            # Report the body that could not be scored and leave it out.
            # Exception (rather than a bare except) keeps a kernel interrupt
            # able to stop this long-running loop.
            print(each)

    body_1[str(year)] = temp

box_plot = _yearly_body_box_plot(body_1, "Holistic analysis distribution")
box_plot.render(r"./fox_html/year_emotion/body_1.html")

for year in df['date'].value_counts().sort_index().index.tolist():
    temp = []
    for each in df[df['date'] == year]['body'].values.tolist():
        num = []
        for e in each.split('.'):
            try:
                num.append(TextBlob(e).sentiment[0])
            except Exception:
                print(each)
        # Guard the division: if no fragment could be scored there is no mean.
        if num:
            temp.append(round(sum(num) / len(num), 2))

    body_2[str(year)] = temp

box_plot = _yearly_body_box_plot(body_2, "Mean-taking analysis")
box_plot.render(r"./fox_html/year_emotion/body_2.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/year_emotion/body_2.html'

In [32]:
#clustering
#statistics and data visualization
#results interpretation

In [33]:
# Topic modelling per year: remove stop words from every article body, build a
# TF-IDF matrix, fit a 5-topic LDA model on it, and save an interactive
# pyLDAvis page for that year.

# Read the stop-word list; the with-block closes the file handle again.
with open('data//stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [words.strip() for words in stopwords_file.readlines()]
stopwords.append(' ')
# Set copy for constant-time membership tests in the token filter below.
stopword_set = set(stopwords)

# The number of topics selected
n_topics = 5
# The number of top words per topic to output (not used in this cell)
n_top_words = 20

for year in df['date'].value_counts().sort_index().index.tolist():
    temp = df[df['date'] == year]['body'].values.tolist()

    # One space-joined string of the remaining tokens per article.
    wordslist = []
    for v in temp:
        seg_list = nltk.word_tokenize(v)
        wordslist.append(' '.join(seg for seg in seg_list if seg not in stopword_set))

    tf_idf_vectorizer = TfidfVectorizer()
    # Sparse TF-IDF matrix; it is passed on as-is, so no dense copy is made.
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core step: feed LDA the resulting TF-IDF matrix
    lda.fit(tf_idf)

    # Use pyLDAvis for visualization
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer, mds='mmds')
    try:
        pyLDAvis.save_html(data, './fox_html/year_lda/lda_{}.html'.format(year))
    except Exception:
        # Best effort: report the year whose page could not be written and
        # carry on. Exception keeps a kernel interrupt working.
        print(year)

In [34]:
# Count how many articles carry each keyword and plot the counts by year

In [35]:
# Articles per keyword, most frequent first.
df['keyword'].value_counts(ascending=False)

Extreme weather                            1249
Climate change                             1094
Sea level rise                              958
Renewable energy                            950
Paris Agreement                             870
Greenhouse gases                            798
Environmental crisis                        768
Environmental policy                        653
Sustainable development                     650
Carbon emissions                            645
Climate advocacy                            613
Climate science                             179
Climate adaptation                          158
Climate policy                               86
Biodiversity loss                            82
Climate action                               74
COP26 (or other UN climate conferences)      54
COP15                                         7
Name: keyword, dtype: int64

In [36]:
# One line chart per keyword with its yearly article counts.
# The least frequent keyword is skipped.
key = df['keyword'].value_counts().index.tolist()[:-1]
count = 0
for k in key:
    # Articles per year for this keyword, in ascending year order.
    temp = df[df['keyword']==k].groupby('date')['keyword'].count().sort_index()

    time_axis = opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    )
    quantity_axis = opts.AxisOpts(
        name='quantity',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    )

    line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
    line.add_xaxis(temp.index.astype('str').tolist())
    line.add_yaxis(k, temp.values.tolist(), is_smooth=True)
    # Half-transparent area under the curve, no value labels on the points.
    line.set_series_opts(
        areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
        label_opts=opts.LabelOpts(is_show=False),
    )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="quantity distribution"),
        xaxis_opts=time_axis,
        yaxis_opts=quantity_axis,
    )
    line.render(f"./fox_html/year_key/{k}.html")

In [37]:
# Yearly article counts (2015-2022) of the five most frequent keywords, drawn
# as five series on one line chart.
# NOTE(review): the year counts shown for this data set start in 2011, but the
# window here starts in 2015 - confirm that 2011-2014 are meant to be left out.
key = df['keyword'].value_counts().index.tolist()[:5]
years = list(range(2015, 2023))

line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
line.add_xaxis([str(i) for i in years])

for ke in key:
    # One slot per year of the window; years without articles stay at 0.
    num = {i: 0 for i in years}
    temp = df[df['keyword']==ke].groupby('date')['keyword'].count().sort_index()
    for k, v in temp.to_dict().items():
        if k in num:
            num[k] = v

    line.add_yaxis(ke, list(num.values()), is_smooth=True)

# Series options only affect series that already exist on the chart, so they
# are applied after all five keywords have been added.
line.set_series_opts(
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False),
)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="quantity distribution"),
    xaxis_opts=opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    ),
    yaxis_opts=opts.AxisOpts(
        name='quantity',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16,
        ),
    ),
)

line.render("./fox_html/year_key/5_key.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/year_key/5_key.html'

### Monthly （2019， 2020， 2021， 2022）

In [38]:
#2019-2022, monthly
#sentiment analysis
#statistics and data visualization
#results interpretation

In [39]:
# Monthly Fox data set: drop incomplete rows, keep 2019 onwards and relabel
# every remaining row with a 'YYYY-M' month string.
df = pd.read_csv("./data/fox.csv")
df = df.dropna()
df['year'] = pd.to_datetime(df['date']).dt.year
df = df[df['year'] >= 2019]

# Parse the dates of the filtered rows once and reuse them for year and month.
stamps = pd.to_datetime(df['date'])
df['date'] = stamps.dt.year.astype(str) + '-' + stamps.dt.month.astype(str)

# Month labels '2019-1' ... '2022-12' in calendar order (48 entries).
y_time = [f"{yr}-{mon}" for yr in range(2019, 2023) for mon in range(1, 13)]


In [40]:
# Headline sentiment per month: score every headline with TextBlob polarity
# and draw one box per month that has more than 5 scored headlines.
headline = {}
for y in y_time:
    temp = []
    for each in df.loc[df['date'] == y, 'headline'].tolist():
        try:
            temp.append(TextBlob(each).sentiment.polarity)
        except Exception:
            # Show the headline that could not be scored and carry on;
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(each)
    if len(temp) > 5:
        headline[str(y)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(headline))
box_plot.add_yaxis(
    series_name="headline",
    y_axis=box_plot.prepare_data(list(headline.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="headline Emotional distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./fox_html/month_emotion/headline.html")


'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/month_emotion/headline.html'

In [41]:
# Holistic analysis: one TextBlob polarity score per whole article body,
# grouped by month; months with 5 or fewer scored bodies are left out.
body_1 = {}  # Holistic analysis
body_2 = {}  # Mean-taking analysis (filled by the following cell)
for year in y_time:
    temp = []
    for each in df.loc[df['date'] == year, 'body'].tolist():
        try:
            temp.append(TextBlob(each).sentiment.polarity)
        except Exception:
            # Show the body that could not be scored and carry on;
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(each)
    if len(temp) > 5:
        body_1[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_1))
box_plot.add_yaxis(
    series_name="body",
    y_axis=box_plot.prepare_data(list(body_1.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Holistic analysis distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./fox_html/month_emotion/body_1.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/month_emotion/body_1.html'

In [42]:
# Mean-taking analysis: split each body on '.', score every fragment with
# TextBlob polarity and keep the article's mean (rounded to 2 decimals).
# Months with 5 or fewer scored articles are left out of the chart.
# NOTE(review): fragments are not filtered, so empty pieces produced by
# split('.') are scored too and enter the mean — confirm this is intended.
for year in y_time:
    temp = []
    for each in df.loc[df['date'] == year, 'body'].tolist():
        num = []
        for e in each.split('.'):
            try:
                num.append(TextBlob(e).sentiment.polarity)
            except Exception:
                # Show only the fragment that failed, not the whole article.
                print(e)
        # Skip an article whose fragments all failed instead of dividing by 0.
        if num:
            temp.append(round(sum(num) / len(num), 2))
    if len(temp) > 5:
        body_2[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_2))
box_plot.add_yaxis(
    series_name="body",
    y_axis=box_plot.prepare_data(list(body_2.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Mean-taking analysis"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./fox_html/month_emotion/body_2.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/month_emotion/body_2.html'

In [43]:
#clustering
#statistics and data visualization
#results interpretation

In [44]:
# Topic modelling per month: remove stop words from every article body,
# build a TF-IDF matrix, fit a 5-topic LDA model and save a pyLDAvis page.

# Read the stop-word list and close the file straight away.
with open('data//stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [words.strip() for words in stopwords_file]
stopwords.append(' ')
# A set makes the per-token membership test constant time.
stopword_set = set(stopwords)

# Selected Topics
n_topics = 5

for year in y_time:
    bodies = df[df['date'] == year]['body'].values.tolist()
    if not bodies:
        # No article in this month: the vectorizer cannot be fitted on an
        # empty corpus, so there is nothing to model.
        continue

    # One space-joined string of non-stop-word tokens per article.
    wordslist = [
        ' '.join(seg for seg in nltk.word_tokenize(body) if seg not in stopword_set)
        for body in bodies
    ]

    tf_idf_vectorizer = TfidfVectorizer()
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core, feed LDA with the generated (sparse) TF-IDF matrix
    lda.fit(tf_idf)

    # Visualization with pyLDAvis
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer, mds='mmds')
    try:
        pyLDAvis.save_html(data, './fox_html/month_lda/lda_{}.html'.format(year.replace('/','-')))
    except Exception as err:
        # Keep going with the remaining months, but say which one failed and why.
        print(year, err)

In [45]:
# Articles per keyword, most frequent first (the ordering the later [:5] and
# [:-1] slices are taken from).
df['keyword'].value_counts(ascending=False)

Extreme weather                            607
Climate change                             472
Environmental crisis                       383
Sea level rise                             379
Paris Agreement                            321
Renewable energy                           298
Environmental policy                       260
Greenhouse gases                           234
Carbon emissions                           227
Climate advocacy                           214
Sustainable development                    148
Climate adaptation                          64
COP26 (or other UN climate conferences)     46
Climate policy                              40
Climate action                              33
Biodiversity loss                           31
Climate science                             24
COP15                                        4
Name: keyword, dtype: int64

In [46]:
# For every keyword except the last (least frequent) entry of the descending
# value_counts list, plot its monthly article count and write one HTML page
# per keyword.
key = df['keyword'].value_counts().index.tolist()[:-1]
for k in key:
    counts = df[df['keyword'] == k].groupby('date')['keyword'].count()
    # Month labels look like '2019-10'; sorting them as plain strings would
    # put '2019-10' before '2019-2', so order them by (year, month) numbers.
    month_order = sorted(
        counts.index,
        key=lambda label: tuple(int(part) for part in label.split('-')),
    )
    temp = counts.loc[month_order]

    line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
    line.add_xaxis(temp.index.astype('str').tolist())
    line.add_yaxis(k, temp.values.tolist(), is_smooth=True)
    line.set_series_opts(
        areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
        label_opts=opts.LabelOpts(is_show=False),
    )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="quantity distribution"),
        datazoom_opts=opts.DataZoomOpts(),
        xaxis_opts=opts.AxisOpts(
            axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
            is_scale=False,
            boundary_gap=False,
            name='time',
            name_location='middle',
            name_gap=30,
            name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
        ),
        yaxis_opts=opts.AxisOpts(
            name='quantity ',
            name_location='middle',
            name_gap=30,
            name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
        ),
    )
    line.render(f"./fox_html/month_key/{k}.html")

In [47]:
# Overlay the monthly article counts (2019-1 ... 2022-12) of the five most
# frequent keywords on one smoothed area-line chart.
key = df['keyword'].value_counts().index.tolist()[:5]

line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
line.add_xaxis(y_time)

for ke in key:
    monthly = df[df['keyword'] == ke].groupby('date')['keyword'].count()
    # Align on y_time itself: the x axis and df['date'] both use its 'YYYY-M'
    # labels. (A separate 'YYYY/M' list never matched and left every series
    # at 0.) Months without articles become 0.
    line.add_yaxis(ke, monthly.reindex(y_time, fill_value=0).tolist(), is_smooth=True)

# set_series_opts only updates series that are already on the chart, so it
# must run after the add_yaxis calls above (before them it styles nothing).
line.set_series_opts(
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False),
)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="quantity distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    xaxis_opts=opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
    ),
    yaxis_opts=opts.AxisOpts(
        name='quantity ',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
    ),
)

line.render("./fox_html/month_key/5_key.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/fox_html/month_key/5_key.html'

# CNN and Fox

### Yearly （2005-2022）

In [48]:
#2005-2022, yearly
#sentiment analysis
#statistics and data visualization
#results interpretation

In [49]:
# Stack the CNN and the balanced Fox articles into one frame, drop incomplete
# rows and reduce every date to its calendar year.
df1, df2 = (pd.read_csv(path) for path in ("./data/news_cnn.csv", "./data/fox_balanced.csv"))
df = pd.concat([df1, df2])
df = df.dropna()
df['date'] = pd.to_datetime(df['date']).dt.year

In [50]:
# Articles per year, most frequent first; 2010 is filtered out right after this.
df['date'].value_counts(ascending=False)

2015    6264
2021    5354
2022    5217
2019    4998
2020    4202
2017    3370
2018    2971
2016    2387
2014    1512
2013    1388
2012    1292
2011     810
2010       2
Name: date, dtype: int64

In [51]:
# Keep every row whose year is not 2010.
df = df.loc[df['date'].ne(2010)]

In [52]:
# Headline sentiment per year (CNN + Fox): score every headline with TextBlob
# polarity and draw one box per year.
headline = {}
for year in sorted(df['date'].unique().tolist()):
    temp = []
    for each in df.loc[df['date'] == year, 'headline'].tolist():
        try:
            temp.append(TextBlob(each).sentiment.polarity)
        except Exception:
            # Show the headline that could not be scored and carry on;
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(each)

    headline[str(year)] = temp

# All years are plotted: the 2010 rows are already removed from df, so the
# first key is a real year and must not be sliced away with [1:].
box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(headline))
box_plot.add_yaxis(
    series_name="headline",
    y_axis=box_plot.prepare_data(list(headline.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="headline Emotional distribution"),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_fox_html/year_emotion/headline.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_fox_html/year_emotion/headline.html'

In [53]:
# Body sentiment per year (CNN + Fox), computed two ways and drawn as two
# box plots:
#   body_1 - holistic: one TextBlob polarity score per whole body
#   body_2 - mean-taking: mean polarity of the body's '.'-separated fragments
# All years are plotted: the 2010 rows are already removed from df, so the
# first key is a real year and must not be sliced away with [1:].
body_1 = {}  # Holistic analysis
body_2 = {}  # Mean-taking analysis
for year in sorted(df['date'].unique().tolist()):
    temp = []
    for each in df.loc[df['date'] == year, 'body'].tolist():
        try:
            temp.append(TextBlob(each).sentiment.polarity)
        except Exception:
            # Show the body that could not be scored and carry on.
            print(each)

    body_1[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_1))
box_plot.add_yaxis(
    series_name="body",
    y_axis=box_plot.prepare_data(list(body_1.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Holistic analysis distribution"),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_fox_html/year_emotion/body_1.html")

# NOTE(review): fragments are not filtered, so empty pieces produced by
# split('.') are scored too and enter the mean — confirm this is intended.
for year in sorted(df['date'].unique().tolist()):
    temp = []
    for each in df.loc[df['date'] == year, 'body'].tolist():
        num = []
        for e in each.split('.'):
            try:
                num.append(TextBlob(e).sentiment.polarity)
            except Exception:
                # Show only the fragment that failed, not the whole article.
                print(e)
        # Skip an article whose fragments all failed instead of dividing by 0.
        if num:
            temp.append(round(sum(num) / len(num), 2))

    body_2[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_2))
box_plot.add_yaxis(
    series_name="body",
    y_axis=box_plot.prepare_data(list(body_2.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Mean-taking analysis"),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_fox_html/year_emotion/body_2.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_fox_html/year_emotion/body_2.html'

In [54]:
#clustering
#statistics and data visualization
#results interpretation

In [55]:
# Topic modelling per year (CNN + Fox): remove stop words from every article
# body, build a TF-IDF matrix, fit a 5-topic LDA model and save a pyLDAvis page.

# Read the stop-word list and close the file straight away.
with open('data//stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [words.strip() for words in stopwords_file]
stopwords.append(' ')
# A set makes the per-token membership test constant time.
stopword_set = set(stopwords)

# Selected Topics
n_topics = 5

for year in sorted(df['date'].unique().tolist()):
    bodies = df[df['date'] == year]['body'].values.tolist()

    # One space-joined string of non-stop-word tokens per article.
    wordslist = [
        ' '.join(seg for seg in nltk.word_tokenize(body) if seg not in stopword_set)
        for body in bodies
    ]

    tf_idf_vectorizer = TfidfVectorizer()
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core, feed LDA with the generated (sparse) TF-IDF matrix
    lda.fit(tf_idf)

    # Visualization with pyLDAvis
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer, mds='mmds')
    try:
        pyLDAvis.save_html(data, './cnn_fox_html/year_lda/lda_{}.html'.format(year))
    except Exception as err:
        # Keep going with the remaining years, but say which one failed and why.
        print(year, err)

In [56]:
# Count how often each of the 20 themes (keywords) occurs, most frequent first;
# the counts are then charted year by year.
df['keyword'].value_counts(ascending=False)

Climate change                             11864
Extreme weather                             4343
Renewable energy                            2745
Environmental crisis                        2599
Paris Agreement                             2579
Sea level rise                              2322
Carbon emissions                            2232
Environmental policy                        2173
Sustainable development                     2142
Greenhouse gases                            1824
Climate advocacy                            1284
Climate action                              1080
Climate policy                              1054
Climate science                              799
Climate adaptation                           372
Biodiversity loss                            230
COP26 (or other UN climate conferences)      108
COP15                                         15
Name: keyword, dtype: int64

In [57]:
# For every keyword except the last (least frequent) entry of the descending
# value_counts list, plot its yearly article count and write one HTML page
# per keyword. (The never-read `count` variable is gone.)
key = df['keyword'].value_counts().index.tolist()[:-1]
for k in key:
    temp = df[df['keyword'] == k].groupby('date')['keyword'].count().sort_index()

    line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
    line.add_xaxis(temp.index.astype('str').tolist())
    line.add_yaxis(k, temp.values.tolist(), is_smooth=True)
    line.set_series_opts(
        areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
        label_opts=opts.LabelOpts(is_show=False),
    )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title="quantity distribution"),
        xaxis_opts=opts.AxisOpts(
            axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
            is_scale=False,
            boundary_gap=False,
            name='time',
            name_location='middle',
            name_gap=30,
            name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
        ),
        yaxis_opts=opts.AxisOpts(
            name='quantity ',
            name_location='middle',
            name_gap=30,
            name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
        ),
    )
    line.render(f"./cnn_fox_html/year_key/{k}.html")



In [58]:
# Overlay the yearly article counts (2011-2022) of the five most frequent
# keywords on one smoothed area-line chart.
key = df['keyword'].value_counts().index.tolist()[:5]
years = list(range(2011, 2023))

line = Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
# The x axis is built from the same year list the series are aligned on, not
# from a `temp` series left over from another cell, so labels and values
# always have the same length and order.
line.add_xaxis([str(year) for year in years])

for ke in key:
    # Articles per year for this keyword; years without any article become 0.
    yearly = df[df['keyword'] == ke].groupby('date')['keyword'].count()
    line.add_yaxis(ke, yearly.reindex(years, fill_value=0).tolist(), is_smooth=True)

# set_series_opts only updates series that are already on the chart, so it
# must run after the add_yaxis calls above (before them it styles nothing).
line.set_series_opts(
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False),
)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="quantity distribution"),
    xaxis_opts=opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
    ),
    yaxis_opts=opts.AxisOpts(
        name='quantity ',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(font_family='Times New Roman', font_size=16),
    ),
)

line.render("./cnn_fox_html/year_key/5_key.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_fox_html/year_key/5_key.html'

### Monthly （2019， 2020， 2021， 2022）

In [59]:
#2019-2022, monthly
#sentiment analysis
#statistics and data visualization
#results interpretation

In [60]:
# Monthly CNN + Fox data set: stack both sources, drop incomplete rows, keep
# 2019 onwards and relabel every remaining row with a 'YYYY/M' month string.
df1, df2 = (pd.read_csv(path) for path in ("./data/news_cnn.csv", "./data/fox_balanced.csv"))
df = pd.concat([df1, df2])
df = df.dropna()
df['year'] = pd.to_datetime(df['date']).dt.year
df = df[df['year'] >= 2019]

# Parse the dates of the filtered rows once and reuse them for year and month.
stamps = pd.to_datetime(df['date'])
df['date'] = stamps.dt.year.astype(str) + '/' + stamps.dt.month.astype(str)

# Month labels '2019/1' ... '2022/12' in calendar order (48 entries).
y_time = [f"{yr}/{mon}" for yr in range(2019, 2023) for mon in range(1, 13)]

In [61]:
# Headline sentiment per month (CNN + Fox): score every headline with TextBlob
# polarity and draw one box per month that has more than 5 scored headlines.
headline = {}
for y in y_time:
    temp = []
    for each in df.loc[df['date'] == y, 'headline'].tolist():
        try:
            temp.append(TextBlob(each).sentiment.polarity)
        except Exception:
            # Show the headline that could not be scored and carry on;
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(each)
    if len(temp) > 5:
        headline[str(y)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(headline))
box_plot.add_yaxis(
    series_name="headline",
    y_axis=box_plot.prepare_data(list(headline.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="headline Emotional distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_fox_html/month_emotion/headline.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_fox_html/month_emotion/headline.html'

In [62]:
# Body sentiment per month (CNN + Fox), computed two ways and drawn as two
# box plots; months with 5 or fewer scored articles are left out:
#   body_1 - holistic: one TextBlob polarity score per whole body
#   body_2 - mean-taking: mean polarity of the body's '.'-separated fragments
body_1 = {}  # Holistic analysis
body_2 = {}  # Mean-taking analysis
for year in y_time:
    temp = []
    for each in df.loc[df['date'] == year, 'body'].tolist():
        try:
            temp.append(TextBlob(each).sentiment.polarity)
        except Exception:
            # Show the body that could not be scored and carry on.
            print(each)
    if len(temp) > 5:
        body_1[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_1))
box_plot.add_yaxis(
    series_name="body",
    y_axis=box_plot.prepare_data(list(body_1.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Holistic analysis distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_fox_html/month_emotion/body_1.html")

# NOTE(review): fragments are not filtered, so empty pieces produced by
# split('.') are scored too and enter the mean — confirm this is intended.
for year in y_time:
    temp = []
    for each in df.loc[df['date'] == year, 'body'].tolist():
        num = []
        for e in each.split('.'):
            try:
                num.append(TextBlob(e).sentiment.polarity)
            except Exception:
                # Show only the fragment that failed, not the whole article.
                print(e)
        # Skip an article whose fragments all failed instead of dividing by 0.
        if num:
            temp.append(round(sum(num) / len(num), 2))
    if len(temp) > 5:
        body_2[str(year)] = temp

box_plot = Boxplot(init_opts=opts.InitOpts(width="1200px", height="730px"))
box_plot.add_xaxis(xaxis_data=list(body_2))
box_plot.add_yaxis(
    series_name="body",
    y_axis=box_plot.prepare_data(list(body_2.values())),
)
box_plot.set_global_opts(
    title_opts=opts.TitleOpts(title="Mean-taking analysis"),
    datazoom_opts=opts.DataZoomOpts(),
    tooltip_opts=opts.TooltipOpts(trigger="item", axis_pointer_type="shadow"),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        boundary_gap=True,
        splitarea_opts=opts.SplitAreaOpts(is_show=False),
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
    yaxis_opts=opts.AxisOpts(
        type_="value",
        name="",
        splitarea_opts=opts.SplitAreaOpts(
            is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
        ),
    ),
)
box_plot.set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{a}: {c}"))

box_plot.render(r"./cnn_fox_html/month_emotion/body_2.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_fox_html/month_emotion/body_2.html'

In [63]:
#clustering
#statistics and data visualization
#results interpretation

In [64]:
# Load the stopword list; the context manager closes the file handle after reading.
with open('data//stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [words.strip() for words in stopwords_file.readlines()]
stopwords.append(' ')
# Set copy of the list so the per-token membership test below is O(1).
# NOTE(review): matching is case-sensitive, while TfidfVectorizer lowercases by
# default, so capitalised stopwords may survive the filter — confirm intent.
stopword_set = set(stopwords)

# Selected Topics
n_topics = 5
# The number of top n_top_words topic words of each topic to output
# (not used in this cell)
n_top_words = 20

# Fit one LDA model per period in y_time and save its pyLDAvis page.
for year in y_time:
    bodies = df[df['date'] == year]['body'].values.tolist()

    # Tokenise every article body and drop stopwords; one cleaned string per article.
    wordslist = [
        ' '.join(seg for seg in nltk.word_tokenize(v) if seg not in stopword_set)
        for v in bodies
    ]

    tf_idf_vectorizer = TfidfVectorizer()
    tf_idf = tf_idf_vectorizer.fit_transform(wordslist)

    # Feature words TF-IDF matrix
    # NOTE(review): X is not used in this cell; kept in case a later cell reads it.
    X = tf_idf.toarray()

    lda = LatentDirichletAllocation(
        n_components=n_topics, max_iter=50,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    # Core, feed LDA with generated TF-IDF matrix
    lda.fit(tf_idf)

    # Visualization with pyLDAvis
    data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer, mds='mmds')
    # '/' in the period label would be read as a directory separator, so swap it for '-'.
    out_path = './cnn_fox_html/month_lda/lda_{}.html'.format(str(year).replace('/', '-'))
    try:
        pyLDAvis.save_html(data, out_path)
    except Exception as exc:
        # Best-effort: report which period failed and why, then continue with the next one.
        print(year, exc)

In [65]:
# For the 20 topics, count the number of times each topic appears, then plot the counts by month

In [66]:
# Display the number of rows per distinct value of the 'keyword' column
# (value_counts orders the result from most to least frequent).
df['keyword'].value_counts()

Climate change                             8182
Extreme weather                            2318
Environmental crisis                       1425
Sea level rise                             1014
Paris Agreement                             966
Carbon emissions                            933
Renewable energy                            929
Environmental policy                        868
Sustainable development                     671
Greenhouse gases                            533
Climate action                              474
Climate advocacy                            449
Climate policy                              426
Climate science                             232
Climate adaptation                          147
Biodiversity loss                           103
COP26 (or other UN climate conferences)      92
COP15                                         9
Name: keyword, dtype: int64

In [67]:
def _chronological_key(label):
    """Turn a 'YYYY/M' label into (YYYY, M) so that '2019/10' sorts after '2019/2'."""
    return tuple(int(part) for part in str(label).split('/'))


# All keywords except the last entry of value_counts(), i.e. the least frequent one.
key = df['keyword'].value_counts().index.tolist()[:-1]

# One line chart per keyword: number of rows per date label, written to its own HTML file.
for k in key:
    temp = df[df['keyword'] == k].groupby('date')['keyword'].count()
    try:
        # A plain string sort would put '2019/10' before '2019/2'; order the labels
        # numerically instead. Assumes 'YYYY/M'-style labels — TODO confirm.
        temp = temp.reindex(sorted(temp.index, key=_chronological_key))
    except ValueError:
        # Labels are not numeric 'a/b' strings: fall back to the default index order.
        temp = temp.sort_index()

    line = (
        Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
        .add_xaxis(temp.index.astype('str').tolist())
        .add_yaxis(k, temp.values.tolist(), is_smooth=True)
        .set_series_opts(
            areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
            label_opts=opts.LabelOpts(is_show=False),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="quantity distribution"),
            datazoom_opts=opts.DataZoomOpts(),
            xaxis_opts=opts.AxisOpts(
                axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
                is_scale=False,
                boundary_gap=False,
                name='time',
                name_location='middle',
                name_gap=30,
                name_textstyle_opts=opts.TextStyleOpts(
                    font_family='Times New Roman',
                    font_size=16
                )),

            yaxis_opts=opts.AxisOpts(
                name='quantity ',
                name_location='middle',
                name_gap=30,
                name_textstyle_opts=opts.TextStyleOpts(
                    font_family='Times New Roman',
                    font_size=16
                )),
        )
    )
    line.render(f"./cnn_fox_html/month_key/{k}.html")


In [68]:
# The five most frequent keywords, drawn together on one chart.
key = df['keyword'].value_counts().index.tolist()[:5]

# Month labels '2019/1' .. '2022/12' in chronological order. The same list drives
# both the x axis and the order of every series, so labels and values always line up.
month_labels = [f"{yr}/{mo}" for yr in range(2019, 2023) for mo in range(1, 13)]

line = (
    Line(init_opts=opts.InitOpts(width="1230px", height="730px", theme=ThemeType.MACARONS))
    .add_xaxis(month_labels)
)

for ke in key:
    temp = df[df['keyword'] == ke].groupby('date')['keyword'].count()
    # Months without any row for this keyword are plotted as 0;
    # tolist() yields plain Python ints for serialisation.
    line.add_yaxis(ke, temp.reindex(month_labels, fill_value=0).tolist(), is_smooth=True)

# set_series_opts only touches series that already exist on the chart, so it has
# to run after the add_yaxis calls above (before them it is a no-op).
line.set_series_opts(
    areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
    label_opts=opts.LabelOpts(is_show=False),
)
line.set_global_opts(
    title_opts=opts.TitleOpts(title="quantity distribution"),
    datazoom_opts=opts.DataZoomOpts(),
    xaxis_opts=opts.AxisOpts(
        axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
        is_scale=False,
        boundary_gap=False,
        name='time',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16
        )),

    yaxis_opts=opts.AxisOpts(
        name='quantity ',
        name_location='middle',
        name_gap=30,
        name_textstyle_opts=opts.TextStyleOpts(
            font_family='Times New Roman',
            font_size=16
        )),
)

# Last expression of the cell: writes the HTML file and displays the returned path.
line.render("./cnn_fox_html/month_key/5_key.html")

'/Users/chunyangzhang/Desktop/macs30200/proj/code1.4/cnn_fox_html/month_key/5_key.html'