# Mapping MSRC CVEs to Bins

Is it possible to map [MSRC CVEs](https://msrc.microsoft.com/update-guide/vulnerability) to Windows binaries?


```{mermaid}

graph TD
    A[winbindex fa:fa-database] --> B[KBs]
    C[ms support feeds fa:fa-database] -->B
    B --> D[msrc_cve_to_bins.py]
    E[File Version Info fa:fa-file] --> D
    F[Known MSRC Tags fa:fa-tag] --> D
    D --> G(cve to bin fa:fa-map)

```

## Import cvedata 

In [None]:
from cvedata.winbindex import get_winbindex_desc_to_bin_map, WINBINDEX_GITHUB_URL, get_winbindex_kbs_to_bin_map
from cvedata.win_verinfo import get_verinfo_desc_to_bins_json
from cvedata.msrc_pandas import get_msrc_tags, get_msrc_titles, get_msrc_cvrf_pandas_df, MSRC_CVRF_PANDAS_FULL,MSRC_CVRF_PANDAS
from cvedata.msrc_known_bins import KNOWN_TAG_TO_BIN_MAP
from cvedata.ms_feed_kbs import get_ms_kb_feeds_json,get_ms_kb_feed_files_json,get_ms_kb_to_bins_json


from cvedata.msrc_pandas import clean_impact
from cvedata.msrc_cve_to_bins import clean_tag

# pandas is imported here because this cell runs before the cell that does the
# notebook-wide `import pandas as pd`; without it this line raises NameError
# on a fresh kernel.
import pandas as pd

# Limit how many rows a DataFrame shows when displayed.
pd.set_option('display.max_rows', 15)



In [None]:
import pandas as pd
import difflib
import json

pd.set_option('display.max_rows', 20)

In [None]:
# Wrap the data returned by get_msrc_cvrf_pandas_df() in a DataFrame; every
# later cell derives its tags, titles and per-CVE rows from this frame.
all_cvrf_df = pd.DataFrame.from_dict(get_msrc_cvrf_pandas_df())

# Preview the first rows.
all_cvrf_df.head()

In [None]:
# Load the lookup data from the cvedata helpers.
# NOTE(review): of these four names only verinfo_desc_to_bins is referenced by
# the later cells visible in this notebook - confirm whether the other three
# are still needed.
tags_json = get_msrc_tags()
titles_json = get_msrc_titles()
kbs_to_bins_json = get_winbindex_kbs_to_bin_map()
verinfo_desc_to_bins = get_verinfo_desc_to_bins_json()

In [None]:
#from cvedata.msrc_cve_to_bins import get_tag_similarity_df_df

   
def get_tag_similarity_df(row,key,desc_to_bins,col_pre,min_sims):
    """
    Add one similarity column per threshold in min_sims to a DataFrame row.

    The value of row[key] is cleaned with clean_tag, split into words and
    compared against every description in desc_to_bins (cleaned and split the
    same way) using difflib.SequenceMatcher(...).ratio() over the word lists.
    For each threshold, the bins of every description whose ratio is greater
    than or equal to that threshold are collected.

    Args:
        row: the row to extend (e.g. the Series passed by
            DataFrame.apply(..., axis=1)); it is modified in place.
        key: name of the entry in row holding the text to match.
        desc_to_bins: mapping of description -> iterable of bins.
        col_pre: prefix for the generated column names.
        min_sims: iterable of minimum similarity ratios.

    Returns:
        The same row with one "{col_pre}-{min_sim}" entry per threshold, each
        a list of bins (possibly empty; bins reached through several
        descriptions appear more than once).
    """

    # One result list per threshold, so every column exists even without matches.
    bins = {min_sim: [] for min_sim in min_sims}

    row_words = clean_tag(row[key]).split()

    # A tag that cleans to nothing cannot match anything: skip the comparison.
    if row_words:
        for desc in desc_to_bins:
            desc_words = clean_tag(desc).split()

            # Skip only this description. A `break` here would silently drop
            # every remaining description after the first empty one.
            if not desc_words:
                continue

            sim = difflib.SequenceMatcher(None,row_words,desc_words).ratio()

            # Add this description's bins to every threshold it satisfies.
            for min_sim in min_sims:
                if sim >= min_sim:
                    bins[min_sim].extend(desc_to_bins[desc])

    for sim_score in bins:
        row[f"{col_pre}-{sim_score}"] = bins[sim_score]

    return row




In [None]:
# Column that get_tag_similarity_df reads from each row in the cells below.
key = 'Tag'
# One row per distinct MSRC tag.
tags_sim_df = pd.DataFrame(all_cvrf_df['Tag'].unique(),columns= ['Tag'])
# tags_sim_wb_df = tags_sim_df.apply(get_tag_similarity_df,args=(key,wb_desc_to_bins,'wb',[.25, .45, .5, .55, .75, .9, 1]),axis=1)
# tags_sim_wb_df.head(20)
tags_sim_df




In [None]:
tags_sim_df.Tag.str.split(expand=True).stack().value_counts()

In [None]:
# Build a one-column frame holding the unique file-description strings.
# json_normalize puts the descriptions in the columns, so the frame is
# transposed to move them into the index. `.T` replaces DataFrame.swapaxes,
# which is deprecated in recent pandas releases.
ver_df = pd.json_normalize(verinfo_desc_to_bins).T
ver_df.index.name = 'desc'

ver_df = pd.DataFrame(ver_df.reset_index()['desc'].unique(),columns= ['desc'])
# Frequency of every whitespace-separated word across the descriptions.
ver_df.desc.str.split(expand=True).stack().value_counts()

In [None]:
#tags_sim_df['wb_bins'] = tags_sim_df['Tag'].apply(lambda x : difflib.get_close_matches(clean_tag(x),wb_desc_to_bins.keys(),n=10000,cutoff=.6))

In [None]:
tags_sim_df

In [None]:
#tags_sim_df[['kb_bins', 'wb_bins']].apply(lambda x: len(x)).value_counts()

In [None]:
# For every distinct tag, collect the bins whose version-info description
# matches at each similarity threshold. get_tag_similarity_df adds one column
# per threshold, named 'vi-<threshold>' ('vi-0.25' ... 'vi-1').
tags_sim_verinfo_df = tags_sim_df.apply(get_tag_similarity_df,args=(key,verinfo_desc_to_bins,'vi',[.25, .45, .5, .55, .75, .9, 1]),axis=1)
tags_sim_verinfo_df.head(25)


In [None]:
# Switch the matched column from 'Tag' to 'Title' for the cells below.
key = 'Title'
# One row per distinct CVE title.
titles_sim_df = pd.DataFrame(all_cvrf_df['Title'].unique(),columns= ['Title'])
# titles_sim_wb_df = titles_sim_df.apply(get_tag_similarity_df,args=(key,wb_desc_to_bins,'wb',[.25, .45, .5, .55, .75, .9, 1]),axis=1)
# titles_sim_wb_df.head(25)
titles_sim_df

In [None]:
# Same matching as for the tags, but against the distinct CVE titles: adds one
# 'vi-<threshold>' column of matching bins per similarity threshold.
titles_sim_verinfo_df = titles_sim_df.apply(get_tag_similarity_df,args=(key,verinfo_desc_to_bins,'vi',[.25, .45, .5, .55, .75, .9, 1]),axis=1)
titles_sim_verinfo_df.head(25)

## What is the average number of bins at each similarity threshold?

In [None]:
tags_sim_verinfo_df.apply(lambda s: s.map(lambda x: len(x) if x else 0)).mean() 

In [None]:
# Average cell length per column. Falsy cells (None, empty list) count as 0,
# matching the guard used for the tag summary; a bare len() raises TypeError
# on a None cell.
titles_sim_verinfo_df.apply(lambda s: s.map(lambda x: len(x) if x else 0)).mean()

In [None]:
tags_sim_verinfo_df.apply(lambda s: s.map(lambda x: len(x) if x else 0)).mean()

In [None]:
# Index the per-tag similarity results by tag (set_index returns a new frame,
# so tags_sim_verinfo_df itself is left untouched).
tags_sim_map = tags_sim_verinfo_df.set_index('Tag')

# Replace every list of bins by its length; falsy cells count as 0.
tag_counts_df = tags_sim_map.apply(
    lambda column: column.map(lambda cell: len(cell) if cell else 0)
)

# For each similarity column, print its name followed by the number of tags
# that matched at least one bin.
for col in tag_counts_df.columns:
    print(col, tag_counts_df[tag_counts_df[col] > 0].count()[col], sep="\n")

In [None]:
# Index the per-title similarity results by title (set_index returns a new
# frame) and write them to titles.json.
titles_sim_map = titles_sim_verinfo_df.set_index('Title')
titles_sim_map.to_json('titles.json')

# Replace every list of bins by its length; falsy cells count as 0.
title_counts_df = titles_sim_map.apply(
    lambda column: column.map(lambda cell: len(cell) if cell else 0)
)

# For each similarity column, print its name followed by the number of titles
# that matched at least one bin.
for col in title_counts_df.columns:
    print(col, title_counts_df[title_counts_df[col] > 0].count()[col], sep="\n")

In [None]:
# NOTE(review): kb_feed_df is not referenced again in the visible cells.
kb_feed_df = None
# 'updated' entry of the KB feed data; cve_to_bin looks each KB up in it with
# kb_feed.get(kb) and treats the result as a list of files.
kb_feed = get_ms_kb_to_bins_json()['updated']
kb_feed

In [None]:


def cve_to_bin(row):
    """
    Attach candidate binaries to a single CVE row.

    The candidates are the union of:
      * the KNOWN_TAG_TO_BIN_MAP entry for the lower-cased tag, if present,
      * the 'vi-0.55' matches stored for the tag in tags_sim_map,
      * the 'vi-0.45' matches stored for the title in titles_sim_map.

    Reads the notebook globals KNOWN_TAG_TO_BIN_MAP, tags_sim_map,
    titles_sim_map and kb_feed, and prints the row name, tag and title of
    every row it processes.

    Args:
        row: a row with 'Tag', 'Title' and 'KBs' entries; 'KBs' is split on
            whitespace to obtain the individual KB identifiers.

    Returns:
        The same row with two added entries: 'Bins' (all candidates) and
        'Bins Updated' (the candidates that also appear in the kb_feed file
        lists of the row's KBs).
    """
    # Progress output, one group of three lines per row.
    print(row.name)
    tag = row['Tag']
    print(tag)
    title = row['Title']
    print(title)

    candidates = set()

    # Hard-coded tag -> bins mapping (keys are lower-case).
    if tag and tag.lower() in KNOWN_TAG_TO_BIN_MAP:
        candidates.update(KNOWN_TAG_TO_BIN_MAP[tag.lower()])

    # Similarity matches pre-computed per tag and per title.
    candidates.update(tags_sim_map['vi-0.55'].loc[tag])
    candidates.update(titles_sim_map['vi-0.45'].loc[title])

    # Files listed in the KB feed for any of this row's KBs.
    kb_files = []
    for kb in row['KBs'].split():
        files_for_kb = kb_feed.get(kb)
        if files_for_kb:
            kb_files.extend(files_for_kb)

    row['Bins Updated'] = list(set(kb_files).intersection(candidates))
    row['Bins'] = list(candidates)

    return row

# Show the available columns, then run cve_to_bin once per row (axis=1); the
# result carries the added 'Bins' and 'Bins Updated' columns.
print(all_cvrf_df.columns)

bins_all_cvrf_df = all_cvrf_df.apply(cve_to_bin,axis=1)
bins_all_cvrf_df

In [None]:
for key in tags_sim_verinfo_df.keys():
    print(key)

tags_sim_verinfo_df[tags_sim_verinfo_df['Tag'] == 'Microsoft Office'] 

In [None]:
bins_all_cvrf_df.sort_values(by=['Bins'], ascending=False)

In [None]:
# How many CVEs have bins assigned?
# Store the length of each list column; empty lists and other falsy cells count as 0.
bins_all_cvrf_df['Bins Count'] = bins_all_cvrf_df['Bins'].apply(lambda x: len(x) if x else 0)
bins_all_cvrf_df['Bins Updated Count'] = bins_all_cvrf_df['Bins Updated'].apply(lambda x: len(x) if x else 0)


In [None]:
bins_all_cvrf_df[bins_all_cvrf_df['Bins Count'] > 0]

In [None]:
bins_all_cvrf_df[bins_all_cvrf_df['Bins Updated Count'] > 0]

In [None]:
# How many from a specific update?
# The comparison must be parenthesised: `&` binds tighter than `>`, so without
# the parentheses the mask is `(contains & count) > 0`, and the bitwise AND of
# True with an even count is 0, which wrongly drops those rows.
bins_all_cvrf_df[
    bins_all_cvrf_df['Initial Release'].str.contains('2022-11')
    & (bins_all_cvrf_df['Bins Updated Count'] > 0)
]

In [None]:
# For every update (distinct 'Initial Release' value), how many CVEs have bins
# assigned and how many do not?
updates = bins_all_cvrf_df['Initial Release'].unique()

results = {}

for update in updates:
    # Exact match on the release value. str.contains(update) would treat the
    # value as a regular expression and also match rows whose release string
    # merely contains it.
    update_df = bins_all_cvrf_df[bins_all_cvrf_df['Initial Release'] == update]

    bins_count = update_df['Bins Count']
    updated_bins_count = update_df['Bins Updated Count']

    results[update] = {
        'with_bins': int((bins_count > 0).sum()),
        'without_bins': int((bins_count == 0).sum()),
        'with_updated_bins': int((updated_bins_count > 0).sum()),
        'without_updated_bins': int((updated_bins_count == 0).sum()),
    }

# One row per update, ordered chronologically.
update_results_df = pd.DataFrame.from_dict(results, orient='index')
update_results_df.index = pd.to_datetime(update_results_df.index)
update_results_df = update_results_df.sort_index()
update_results_df


In [None]:
update_results_df.loc[update_results_df.index.year == 2022][['with_bins','without_bins']].plot.bar(figsize=(20,3), stacked=True)

In [None]:
update_results_df[['with_bins','without_bins']].plot.bar(figsize=(20,5), stacked=True)

In [None]:
update_results_df.loc[update_results_df.index.year == 2022][['with_updated_bins','without_updated_bins']].plot.bar(figsize=(20,3), stacked=True)

In [None]:
update_results_df[['with_updated_bins','without_updated_bins']].plot.bar(figsize=(20,5), stacked=True)

In [None]:
# what is the next best title to hardcode?

bins_all_cvrf_df[bins_all_cvrf_df['Bins Count'] == 0]['Title'].value_counts()

In [None]:
# what is the next best tag to hardcode?
bins_all_cvrf_df[bins_all_cvrf_df['Bins Count'] == 0]['Tag'].value_counts()

In [None]:
# What is the next best tag/title combo to hardcode?
# Start from the 2022 CVEs that have no bins assigned.
tags_df = bins_all_cvrf_df[bins_all_cvrf_df['Bins Count'] == 0]
tags_df = tags_df.loc[tags_df['Initial Release'].str.contains('2022')]

# 'Bins' and 'Bins Updated' hold lists, which are unhashable and cannot be
# collected into a set, so they are dropped before aggregating.
tags_title_df = (
    tags_df.drop(columns=['Bins', 'Bins Updated'])
    .reset_index()
    .groupby(['Tag','Title'])
    .aggregate(set)
)
# reset_index() exposed the row labels as the 'index' column; cve_to_bin
# prints them as row.name. NOTE(review): they look like CVE ids - confirm.
tags_title_df = tags_title_df.rename(columns={'index': 'CVEs'})
tags_title_df['CVE Count'] = tags_title_df['CVEs'].apply(len)
tags_title_df.sort_values(by=['CVE Count'], ascending=False)