# MSRC Tags to File Version Info

In [None]:
import difflib
import requests
import pandas as pd
import requests
import json
import matplotlib

from cvedata.msrc_pandas import get_msrc_tags, get_msrc_titles, get_msrc_cvrf_pandas_df, MSRC_CVRF_PANDAS_FULL,MSRC_CVRF_PANDAS, get_msrc_tags_freq
from cvedata.win_verinfo import get_verinfo_desc_to_bins_json
from cvedata.winbindex import get_winbindex_desc_to_bin_map

from cvedata.msrc_cve_to_bins import clean_tag

In [None]:
# MSRC tag frequencies (used below to build tags_df) and MSRC titles.
# NOTE(review): `titles` is not referenced by any cell visible in this notebook.
tags = get_msrc_tags_freq()
titles = get_msrc_titles()

# Lookup tables passed to the fuzzy matcher below. Presumably each maps a file
# description to the binaries that carry it -- confirm against cvedata.
win10_sys32_ver = get_verinfo_desc_to_bins_json()
wb_ver = get_winbindex_desc_to_bin_map()

In [None]:
# Gather every binary name listed under any key of either lookup table.
# A set drops names that occur in both sources or under several keys.
# (No loop variable is called `bin`: at notebook top level that would shadow
# the builtin bin() for every later cell.)
unique_bin_names = set()

for desc_to_bins in (wb_ver, win10_sys32_ver):
    for desc in desc_to_bins:
        unique_bin_names.update(desc_to_bins[desc])

bin_names = list(unique_bin_names)

# Map each binary name to a one-element list holding itself, so this table has
# the same key -> list-of-bins shape as the other lookups and can be handed to
# get_match_at_cutoff unchanged.
all_bin_names = {bin_name: [bin_name] for bin_name in bin_names}

len(all_bin_names)

In [None]:
# Key counts of the two lookup tables, one per line:
# version-info first, then winbindex.
print(len(win10_sys32_ver), len(wb_ver), sep="\n")

In [None]:
# One row per tag: move the index into a regular column named 'tag' and keep
# the frequency next to it under 'tag count'.
tags_df = (
    pd.DataFrame({'tag count': tags})
    .reset_index()
    .rename(columns={'index': 'tag'})
)

# Display only (result is not stored): most frequent tags first.
tags_df.sort_values(by='tag count', ascending=False)

In [None]:
def get_match_at_cutoff(key, possibilities: dict, cutoff: float = 0.6, *, max_matches: int = 10000) -> list:
    """Return the bins whose lookup key fuzzily matches *key*.

    The key is normalised with ``clean_tag`` and compared against the keys of
    *possibilities* with ``difflib.get_close_matches``.

    Args:
        key: Tag text to look up. ``None`` yields an empty list.
        possibilities: Mapping from candidate strings to iterables of bins.
        cutoff: Minimum similarity (0 to 1) a candidate must reach to match.
        max_matches: Upper bound on the number of matching candidates
            considered (passed to difflib as ``n``).

    Returns:
        A flat list of the bins of every matching candidate, closest match
        first. A bin listed under several matching candidates appears once
        per candidate.
    """
    if key is None:
        return []

    cleaned_key = clean_tag(key)
    matches = difflib.get_close_matches(
        cleaned_key, possibilities.keys(), n=max_matches, cutoff=cutoff
    )

    # Flatten: each matched candidate contributes all of its bins.
    return [bin_name for desc in matches for bin_name in possibilities[desc]]



In [None]:
# Number of keys in the version-info lookup (shown as the cell output).
len(win10_sys32_ver)

In [None]:
# Fuzzy-match every tag against the keys of the version-info lookup
# (default cutoff) and store the resulting bins per tag.
tags_df['win10_sys32_ver'] = tags_df['tag'].apply(
    lambda tag: get_match_at_cutoff(tag, win10_sys32_ver)
)

# Show only the tags that matched at least one bin.
tags_df[tags_df['win10_sys32_ver'].str.len() > 0]

In [None]:
# wb_ver: fuzzy-match every tag against the keys of the winbindex lookup
# (default cutoff) and store the resulting bins per tag.
tags_df['wb_ver'] = tags_df['tag'].apply(
    lambda tag: get_match_at_cutoff(tag, wb_ver)
)

# Show only the tags that matched at least one bin.
tags_df[tags_df['wb_ver'].str.len() > 0]

In [None]:
# all_bin_names: fuzzy-match every tag directly against the binary names,
# using a stricter cutoff (0.75) than the default.
tags_df['all_bin_names'] = tags_df['tag'].apply(
    lambda tag: get_match_at_cutoff(tag, all_bin_names, 0.75)
)

# Show only the tags that matched at least one bin.
tags_df[tags_df['all_bin_names'].str.len() > 0]

In [None]:
# Tags matched only through the version-info lookup:
# at least one bin there, none via winbindex, none via binary names.
tags_df[
    (tags_df['win10_sys32_ver'].str.len() > 0)
    & (tags_df['wb_ver'].str.len() == 0)
    & (tags_df['all_bin_names'].str.len() == 0)
]

In [None]:
# Tags matched only through the winbindex lookup:
# at least one bin there, none via version-info, none via binary names.
tags_df[
    (tags_df['wb_ver'].str.len() > 0)
    & (tags_df['win10_sys32_ver'].str.len() == 0)
    & (tags_df['all_bin_names'].str.len() == 0)
]

In [None]:
# Tags matched only through the binary-name lookup:
# at least one bin there, none via winbindex, none via version-info.
tags_df[
    (tags_df['all_bin_names'].str.len() > 0)
    & (tags_df['wb_ver'].str.len() == 0)
    & (tags_df['win10_sys32_ver'].str.len() == 0)
]

In [None]:
# Tags with at least one match when compared against binary names only
# (regardless of what the other two lookups found).
tags_df.loc[tags_df['all_bin_names'].str.len().gt(0)]